diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a57652cb364..841a02f72e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -221,7 +221,8 @@ $ ./build.sh dask_cudf - To run Python tests (Optional): ```bash $ cd $CUDF_HOME/python -$ py.test -v # run python tests on cudf and dask-cudf python bindings +$ py.test -v cudf # run cudf test suite +$ py.test -v dask_cudf # run dask_cudf test suite ``` - Other `build.sh` options: diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 981e886d31c..8235f9de0e5 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -11,7 +11,8 @@ LC_ALL=C.UTF-8 LANG=C.UTF-8 # Activate common conda env -source activate gdf +. /opt/conda/etc/profile.d/conda.sh +conda activate rapids # Run isort and get results/return code ISORT=`isort --check-only python/**/*.py` diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 1d0154aedc7..355b18f4543 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -201,8 +201,8 @@ fi ################################################################################ # If examples grows too large to build, should move to cpu side -gpuci_logger "Building libcudf examples" -$WORKSPACE/cpp/examples/build.sh +# gpuci_logger "Building libcudf examples" +# $WORKSPACE/cpp/examples/build.sh # set environment variable for numpy 1.16 # will be enabled for later versions by default @@ -217,7 +217,7 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term +py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh new file mode 100755 index 00000000000..8c4b597d12d --- /dev/null +++ b/ci/gpu/java.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+############################################## +# cuDF GPU build and test script for CI # +############################################## +set -e +NUMARGS=$# +ARGS=$* + +# Arg parsing function +function hasArg { + (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") +} + +# Set path and build parallel level +export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH +export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} + +# Set home to the job's workspace +export HOME="$WORKSPACE" + +# Switch to project root; also root of repo checkout +cd "$WORKSPACE" + +# Determine CUDA release version +export CUDA_REL=${CUDA_VERSION%.*} +export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/" + +# Parse git describe +export GIT_DESCRIBE_TAG=`git describe --tags` +export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` + +################################################################################ +# TRAP - Setup trap for removing jitify cache +################################################################################ + +# Set `LIBCUDF_KERNEL_CACHE_PATH` environment variable to $HOME/.jitify-cache +# because it's local to the container's virtual file system, and not shared with +# other CI jobs like `/tmp` is +export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" + +function remove_libcudf_kernel_cache_dir { + EXITCODE=$? + gpuci_logger "TRAP: Removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH" + rm -rf "$LIBCUDF_KERNEL_CACHE_PATH" \ + || gpuci_logger "[ERROR] TRAP: Could not rm -rf $LIBCUDF_KERNEL_CACHE_PATH" + exit $EXITCODE +} + +# Set trap to run on exit +gpuci_logger "TRAP: Set trap to remove jitify cache on exit" +trap remove_libcudf_kernel_cache_dir EXIT + +mkdir -p "$LIBCUDF_KERNEL_CACHE_PATH" \ + || gpuci_logger "[ERROR] TRAP: Could not mkdir -p $LIBCUDF_KERNEL_CACHE_PATH" + +################################################################################ +# SETUP - Check environment +################################################################################ + +gpuci_logger "Check environment variables" +env + +gpuci_logger "Check GPU usage" +nvidia-smi + +gpuci_logger "Activate conda env" +. /opt/conda/etc/profile.d/conda.sh +conda activate rapids + +gpuci_logger "Check conda environment" +conda info +conda config --show-sources +conda list --show-channel-urls + +gpuci_logger "Install dependencies" +gpuci_conda_retry install -y \ + "cudatoolkit=$CUDA_REL" \ + "rapids-build-env=$MINOR_VERSION.*" \ + "rapids-notebook-env=$MINOR_VERSION.*" \ + "dask-cuda=${MINOR_VERSION}" \ + "rmm=$MINOR_VERSION.*" \ + "ucx-py=0.21.*" \ + "openjdk=8.*" \ + "maven" + +# https://docs.rapids.ai/maintainers/depmgmt/ +# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +# gpuci_conda_retry install -y "your-pkg=1.0.0" + + +gpuci_logger "Check compiler versions" +python --version +$CC --version +$CXX --version + +gpuci_logger "Check conda environment" +conda info +conda config --show-sources +conda list --show-channel-urls + +function install_dask { + # Install the main version of dask, distributed, and streamz + gpuci_logger "Install the main version of dask, distributed, and streamz" + set -x + pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps + # Need to uninstall streamz that is already in the env. 
+ pip uninstall -y streamz + pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps + set +x +} + +################################################################################ +# INSTALL - Install libcudf artifacts +################################################################################ + +export LIB_BUILD_DIR="$WORKSPACE/ci/artifacts/cudf/cpu/libcudf_work/cpp/build" +export CUDF_ROOT=${LIB_BUILD_DIR} +export LD_LIBRARY_PATH="$LIB_BUILD_DIR:$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" + +CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` +CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension +CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install spec +KAFKA_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf_kafka-*.tar.bz2"` +KAFKA_CONDA_FILE=`basename "$KAFKA_CONDA_FILE" .tar.bz2` #get filename without extension +KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install spec + +gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE" +conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE" + +install_dask + +################################################################################ +# TEST - Run java tests +################################################################################ + +gpuci_logger "Check GPU usage" +nvidia-smi + +gpuci_logger "Running Java Tests" +cd ${WORKSPACE}/java +mvn test -B -DCUDF_JNI_ARROW_STATIC=OFF + +exit ${EXITCODE} diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 1568327f88c..30586c91351 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -7,8 +7,8 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 + - clang=11.0.0 + - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - rmm=21.08.* - cmake>=3.20.1 @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=4.0.1=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -31,7 +31,6 @@ dependencies: - nbsphinx - numpydoc - ipython - - recommonmark - pandoc=<2.0.0 - cudatoolkit=11.0 - pip @@ -44,12 +43,11 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz + - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 - - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - double-conversion - rapidjson - - flatbuffers - hypothesis - sphinx-markdown-tables - sphinx-copybutton diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 9d520ada253..f2bc5a21079 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -7,8 +7,8 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 + - clang=11.0.0 + - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - rmm=21.08.* - cmake>=3.20.1 @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=4.0.1=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -31,7 +31,6 @@ dependencies: - nbsphinx - numpydoc - ipython - - recommonmark - pandoc=<2.0.0 - cudatoolkit=11.2 - pip @@ -44,12 +43,11 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz + - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 - - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - double-conversion - rapidjson - - flatbuffers - hypothesis - sphinx-markdown-tables - sphinx-copybutton diff
--git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index d1aaf924555..9023e89c2f5 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - numba >=0.53.1 - dlpack>=0.5,<0.6.0a0 - - pyarrow 1.0.1 + - pyarrow 4.0.1 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} @@ -42,7 +42,7 @@ requirements: - cupy >7.1.0,<10.0.0a0 - numba >=0.53.1 - numpy - - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} + - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda - fastavro >=0.22.0 - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec>=0.6.0 diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 14b94dd2249..6c4175a2539 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,11 +37,12 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 1.0.1 + - arrow-cpp 4.0.1 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} + - arrow-cpp * *cuda - arrow-cpp-proc * cuda - {{ pin_compatible('dlpack', max_pin='x.x') }} @@ -220,6 +221,7 @@ test: - test -f $PREFIX/include/cudf/utilities/error.hpp - test -f $PREFIX/include/cudf/utilities/traits.hpp - test -f $PREFIX/include/cudf/utilities/type_dispatcher.hpp + - test -f $PREFIX/include/cudf/utilities/type_checks.hpp - test -f $PREFIX/include/cudf/utilities/default_stream.hpp - test -f $PREFIX/include/cudf/wrappers/dictionary.hpp - test -f $PREFIX/include/cudf/wrappers/durations.hpp diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index f1ec813a17f..6b15890e7c7 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -25,8 +25,8 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf {{ version }} - - librdkafka >=1.5.0,<1.5.3 + - libcudf {{version}} + - librdkafka >=1.6.0,<1.7.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not diff --git a/cpp/.clang-format b/cpp/.clang-format index 11404b0226e..0c05436e922 100644 --- a/cpp/.clang-format +++ b/cpp/.clang-format @@ -6,16 +6,22 @@ Language: Cpp AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: true AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false # This is deprecated AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None @@ -40,14 +46,14 @@ BraceWrapping: SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: false BreakBeforeBinaryOperators: None BreakBeforeBraces: WebKit BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: 
false +BreakInheritanceList: BeforeColon BreakStringLiterals: true ColumnLimit: 100 CommentPragmas: '^ IWYU pragma:' @@ -57,7 +63,7 @@ ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 2 ContinuationIndentWidth: 2 Cpp11BracedListStyle: true -DerivePointerAlignment: true +DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true @@ -139,14 +145,17 @@ SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 SpacesInAngles: false +SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -Standard: Cpp11 +Standard: c++17 StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 678f202d106..ab7d8389c88 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -40,10 +40,12 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) option(USE_NVTX "Build with NVTX support" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) -option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) +option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" OFF) option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) +option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) +option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." 
OFF) @@ -54,7 +56,7 @@ option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") -message(VERBOSE "CUDF: Configure CMake to build (google) benchmarks: ${BUILD_BENCHMARKS}") +message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}") message(VERBOSE "CUDF: Build cuDF shared libraries: ${BUILD_SHARED_LIBS}") message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_CACHE}") message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}") @@ -153,6 +155,34 @@ add_library(cudf src/ast/transform.cu src/binaryop/binaryop.cpp src/binaryop/compiled/binary_ops.cu + src/binaryop/compiled/Add.cu + src/binaryop/compiled/ATan2.cu + src/binaryop/compiled/BitwiseAnd.cu + src/binaryop/compiled/BitwiseOr.cu + src/binaryop/compiled/BitwiseXor.cu + src/binaryop/compiled/Less.cu + src/binaryop/compiled/Greater.cu + src/binaryop/compiled/LessEqual.cu + src/binaryop/compiled/GreaterEqual.cu + src/binaryop/compiled/Div.cu + src/binaryop/compiled/equality_ops.cu + src/binaryop/compiled/FloorDiv.cu + src/binaryop/compiled/LogBase.cu + src/binaryop/compiled/LogicalAnd.cu + src/binaryop/compiled/LogicalOr.cu + src/binaryop/compiled/Mod.cu + src/binaryop/compiled/Mul.cu + src/binaryop/compiled/NullMax.cu + src/binaryop/compiled/NullMin.cu + src/binaryop/compiled/PMod.cu + src/binaryop/compiled/Pow.cu + src/binaryop/compiled/PyMod.cu + src/binaryop/compiled/ShiftLeft.cu + src/binaryop/compiled/ShiftRight.cu + src/binaryop/compiled/ShiftRightUnsigned.cu + src/binaryop/compiled/Sub.cu + src/binaryop/compiled/TrueDiv.cu + src/binaryop/compiled/util.cpp src/labeling/label_bins.cu src/bitmask/null_mask.cu src/bitmask/is_element_valid.cpp @@ -194,14 +224,16 @@ add_library(cudf src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/groupby.cu + src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu - src/groupby/sort/aggregate.cpp src/groupby/sort/group_collect.cu - src/groupby/sort/group_merge_lists.cu src/groupby/sort/group_count.cu + src/groupby/sort/group_m2.cu src/groupby/sort/group_max.cu src/groupby/sort/group_min.cu + src/groupby/sort/group_merge_lists.cu + src/groupby/sort/group_merge_m2.cu src/groupby/sort/group_nth_element.cu src/groupby/sort/group_nunique.cu src/groupby/sort/group_product.cu @@ -272,7 +304,7 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu - src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu @@ -354,6 +386,7 @@ add_library(cudf src/strings/convert/convert_urls.cu src/strings/copying/concatenate.cu src/strings/copying/copying.cu + src/strings/copying/shift.cu src/strings/extract.cu src/strings/filling/fill.cu src/strings/filter_chars.cu @@ -411,6 +444,7 @@ add_library(cudf src/unary/nan_ops.cu src/unary/null_ops.cu src/utilities/default_stream.cpp + src/utilities/type_checks.cpp ) set_target_properties(cudf @@ -575,6 +609,8 @@ if(CUDF_BUILD_BENCHMARKS) GIT_SHALLOW TRUE OPTIONS "BENCHMARK_ENABLE_TESTING OFF" "BENCHMARK_ENABLE_INSTALL OFF") + # Find or install NVBench + include(cmake/thirdparty/CUDF_GetNVBench.cmake) add_subdirectory(benchmarks) endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 
e8ccb24f44c..e5bee4771df 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -50,11 +50,19 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) function(ConfigureBench CMAKE_BENCH_NAME) add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) endfunction() +function(ConfigureNVBench CMAKE_BENCH_NAME) + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties(${CMAKE_BENCH_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + target_link_libraries(${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common cudf_datagen nvbench::main) +endfunction() + ################################################################################################### # - column benchmarks ----------------------------------------------------------------------------- ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) @@ -93,7 +101,7 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/join_benchmark.cu) +ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- @@ -195,6 +203,7 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # - binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cpp + binaryop/compiled_binaryop_benchmark.cpp binaryop/jit_binaryop_benchmark.cpp) ################################################################################################### diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp b/cpp/benchmarks/ast/transform_benchmark.cpp index d39faec3ac4..6f131cf0d6a 100644 --- a/cpp/benchmarks/ast/transform_benchmark.cpp +++ b/cpp/benchmarks/ast/transform_benchmark.cpp @@ -30,9 +30,9 @@ #include #include -#include #include #include +#include #include enum class TreeType { @@ -40,11 +40,11 @@ enum class TreeType { // child column reference }; -template +template class AST : public cudf::benchmark { }; -template +template static void BM_ast_transform(benchmark::State& state) { const cudf::size_type table_size{(cudf::size_type)state.range(0)}; @@ -56,10 +56,24 @@ static void BM_ast_transform(benchmark::State& state) auto columns = std::vector(n_cols); auto data_iterator = thrust::make_counting_iterator(0); - std::generate_n(column_wrappers.begin(), n_cols, [=]() { - return cudf::test::fixed_width_column_wrapper(data_iterator, - data_iterator + table_size); - }); + + if constexpr (Nullable) { + auto validities = std::vector(table_size); + std::random_device rd; + std::mt19937 gen(rd()); + + std::generate( + validities.begin(), validities.end(), [&]() { return gen() > (0.5 * gen.max()); }); + std::generate_n(column_wrappers.begin(), n_cols, [=]() { + return cudf::test::fixed_width_column_wrapper( + data_iterator, data_iterator + table_size, validities.begin()); + }); + } else { + 
std::generate_n(column_wrappers.begin(), n_cols, [=]() { + return cudf::test::fixed_width_column_wrapper(data_iterator, + data_iterator + table_size); + }); + } std::transform( column_wrappers.begin(), column_wrappers.end(), columns.begin(), [](auto const& col) { return static_cast(col); @@ -113,29 +127,32 @@ static void BM_ast_transform(benchmark::State& state) (tree_levels + 1) * sizeof(key_type)); } -#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_ast_transform(st); } - -AST_TRANSFORM_BENCHMARK_DEFINE(ast_int32_imbalanced_unique, - int32_t, - TreeType::IMBALANCED_LEFT, - false); -AST_TRANSFORM_BENCHMARK_DEFINE(ast_int32_imbalanced_reuse, - int32_t, - TreeType::IMBALANCED_LEFT, - true); -AST_TRANSFORM_BENCHMARK_DEFINE(ast_double_imbalanced_unique, - double, - TreeType::IMBALANCED_LEFT, - false); +#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ + (::benchmark::State & st) { BM_ast_transform(st); } + +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_reuse, int32_t, TreeType::IMBALANCED_LEFT, true, false); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false, false); + +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_unique_nulls, int32_t, TreeType::IMBALANCED_LEFT, false, true); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); static void CustomRanges(benchmark::internal::Benchmark* b) { auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; auto operation_counts = std::vector{1, 5, 10}; for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { b->Args({row_count, operation_count}); } + for (auto const& operation_count : operation_counts) { + b->Args({row_count, operation_count}); + } } } @@ -153,3 +170,18 @@ BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique) ->Apply(CustomRanges) ->Unit(benchmark::kMillisecond) ->UseManualTime(); + +BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls) + ->Apply(CustomRanges) + ->Unit(benchmark::kMillisecond) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls) + ->Apply(CustomRanges) + ->Unit(benchmark::kMillisecond) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls) + ->Apply(CustomRanges) + ->Unit(benchmark::kMillisecond) + ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp index 753dcc83b54..314d657679b 100644 --- a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp @@ -113,7 +113,9 @@ static void CustomRanges(benchmark::internal::Benchmark* b) auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; auto operation_counts = std::vector{1, 2, 5, 10}; for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { b->Args({row_count, operation_count}); } + for (auto const& operation_count : 
operation_counts) { + b->Args({row_count, operation_count}); + } } } diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp new file mode 100644 index 00000000000..aa86f3bedf8 --- /dev/null +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +#include + +template +class COMPILED_BINARYOP : public cudf::benchmark { +}; + +template +void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +{ + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper input1(data_it, data_it + column_size); + cudf::test::fixed_width_column_wrapper input2(data_it, data_it + column_size); + + auto lhs = cudf::column_view(input1); + auto rhs = cudf::column_view(input2); + auto output_dtype = cudf::data_type(cudf::type_to_id()); + + // Call once for hot cache. + cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype); + } +} + +// TODO tparam boolean for null. 
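(Reading aid, not part of the patch: one instantiation of the macro defined below, e.g. BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t) from the list at the end of the file, expands to roughly the following sketch. The explicit template-argument list on BM_compiled_binaryop is an assumption; everything else mirrors the macro body.)

BENCHMARK_TEMPLATE_DEFINE_F(
  COMPILED_BINARYOP, ADD, float, int64_t, int32_t, cudf::binary_operator::ADD)
(::benchmark::State& st)
{
  // Builds two counting-sequence columns, makes one warm-up call, then times
  // cudf::experimental::binary_operation per iteration (see BM_compiled_binaryop above).
  BM_compiled_binaryop<float, int64_t, int32_t>(st, cudf::binary_operator::ADD);  // template args assumed
}
BENCHMARK_REGISTER_F(COMPILED_BINARYOP, ADD)
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime()
  ->Arg(10000)      /* 10k */
  ->Arg(100000000); /* 100M; intermediate sizes as in the macro */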
+#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ + BENCHMARK_TEMPLATE_DEFINE_F( \ + COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + (::benchmark::State & st) \ + { \ + BM_compiled_binaryop(st, cudf::binary_operator::binop); \ + } \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ + ->Arg(100000000); /* 100M */ + +using namespace cudf; +using namespace numeric; + +// clang-format off +BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); +BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp index 29ca02a843d..3c02f47eeb7 100644 --- a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp @@ -23,7 +23,7 @@ #include -template +template class JIT_BINARYOP : public cudf::benchmark { }; @@ -50,22 +50,24 @@ void BM_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. 
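(Context note: the JIT benchmark rewritten below shares its setup with the compiled benchmark above; the two differ mainly in the entry point under test. A minimal sketch of the timed region, assuming the 21.08 API in which cudf::binary_operation compiles its kernel at run time via Jitify while cudf::experimental::binary_operation is the precompiled path:)

for (auto _ : state) {
  cuda_event_timer timer(state, true);  // flush_l2_cache = true; reports GPU time via CUDA events
  cudf::binary_operation(lhs, rhs, binop, output_dtype);  // JIT path (assumed call, mirroring the compiled variant)
}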
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - BENCHMARK_TEMPLATE_DEFINE_F(JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut) \ - (::benchmark::State & st) \ - { \ - BM_binaryop(st, cudf::binary_operator::binop); \ - } \ - BENCHMARK_REGISTER_F(JIT_BINARYOP, binop) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ + BENCHMARK_TEMPLATE_DEFINE_F( \ + JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + (::benchmark::State & st) \ + { \ + BM_binaryop(st, cudf::binary_operator::binop); \ + } \ + BENCHMARK_REGISTER_F(JIT_BINARYOP, binop) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ using namespace cudf; +using namespace numeric; // clang-format off BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); @@ -75,16 +77,23 @@ BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int6 BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 591e42ceddf..ea54d4daf05 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -53,7 +53,7 @@ T get_distribution_mean(distribution_params const& dist) auto const range_size = dist.lower_bound < dist.upper_bound ? 
dist.upper_bound - dist.lower_bound : dist.lower_bound - dist.upper_bound; - auto const p = geometric_dist_p(range_size); + auto const p = geometric_dist_p(range_size); if (dist.lower_bound < dist.upper_bound) return dist.lower_bound + (1. / p); else @@ -108,7 +108,8 @@ size_t avg_element_bytes(data_profile const& profile, cudf::type_id tid) /** * @brief Functor that computes a random column element with the given data profile. * - * The implementation is SFINAEd for diffent type groups. Currently only used for fixed-width types. + * The implementation is SFINAEd for different type groups. Currently only used for fixed-width + * types. */ template struct random_value_fn; diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp index acb8adc98e9..6c2a43a34e2 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_benchmark_input.hpp @@ -137,7 +137,7 @@ struct distribution_params< }; /** - * @brief Boolens are parameterized with the probability of getting `true` value. + * @brief Booleans are parameterized with the probability of getting `true` value. */ template struct distribution_params::value>> { @@ -195,7 +195,7 @@ std::vector get_type_or_group(int32_t id); * * If an element of the input vector is a `cudf::type_id` enumerator, function return value simply * includes this type. If an element of the input vector is a `type_group_id` enumerator, function - * return value includes all types coresponding to the group enumerator. + * return value includes all types corresponding to the group enumerator. * * @param ids Vector of integers equal to either a `cudf::type_id` enumerator or a `type_group_id` * enumerator. diff --git a/cpp/benchmarks/hashing/partition_benchmark.cpp b/cpp/benchmarks/hashing/partition_benchmark.cpp index d10b63dc4e1..185f19f28e5 100644 --- a/cpp/benchmarks/hashing/partition_benchmark.cpp +++ b/cpp/benchmarks/hashing/partition_benchmark.cpp @@ -65,7 +65,9 @@ static void CustomRanges(benchmark::internal::Benchmark* b) { for (int columns = 1; columns <= 256; columns *= 16) { for (int partitions = 64; partitions <= 1024; partitions *= 2) { - for (int rows = 1 << 17; rows <= 1 << 21; rows *= 2) { b->Args({rows, columns, partitions}); } + for (int rows = 1 << 17; rows <= 1 << 21; rows *= 2) { + b->Args({rows, columns, partitions}); + } } } } diff --git a/cpp/benchmarks/io/cuio_benchmark_common.cpp b/cpp/benchmarks/io/cuio_benchmark_common.cpp index f2aa216d413..627ac9ccc04 100644 --- a/cpp/benchmarks/io/cuio_benchmark_common.cpp +++ b/cpp/benchmarks/io/cuio_benchmark_common.cpp @@ -94,7 +94,8 @@ std::vector select_column_indexes(int num_cols, column_selection col_sel) (col_sel == column_selection::SECOND_HALF) ? 
num_cols / 2 : 0); break; case column_selection::ALTERNATE: - for (size_t i = 0; i < col_idxs.size(); ++i) col_idxs[i] = 2 * i; + for (size_t i = 0; i < col_idxs.size(); ++i) + col_idxs[i] = 2 * i; break; } return col_idxs; diff --git a/cpp/benchmarks/iterator/iterator_benchmark.cu b/cpp/benchmarks/iterator/iterator_benchmark.cu index 04307f5db25..b4bb99abdde 100644 --- a/cpp/benchmarks/iterator/iterator_benchmark.cu +++ b/cpp/benchmarks/iterator/iterator_benchmark.cu @@ -61,7 +61,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite // ----------------------------------------------------------------------------- template -void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector &result) +void raw_stream_bench_cub(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "raw stream cub: " << "\t"; @@ -73,7 +73,7 @@ void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector &result }; template -void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector &result) +void iterator_bench_cub(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "iterator cub " << ( (has_null) ? ": " : ": " ) << "\t"; @@ -91,7 +91,7 @@ void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector &result) // ----------------------------------------------------------------------------- template -void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) +void raw_stream_bench_thrust(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "raw stream thust: " << "\t\t"; @@ -102,7 +102,7 @@ void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector &res } template -void iterator_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) +void iterator_bench_thrust(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "iterator thust " << ( (has_null) ? 
": " : ": " ) << "\t"; @@ -124,7 +124,7 @@ class Iterator : public cudf::benchmark { }; template -void BM_iterator(benchmark::State &state) +void BM_iterator(benchmark::State& state) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; @@ -165,8 +165,8 @@ __device__ thrust::pair operator+(thrust::pair lhs, thrust::pa } // ----------------------------------------------------------------------------- template -void pair_iterator_bench_cub(cudf::column_view &col, - rmm::device_uvector> &result) +void pair_iterator_bench_cub(cudf::column_view& col, + rmm::device_uvector>& result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -176,8 +176,8 @@ void pair_iterator_bench_cub(cudf::column_view &col, } template -void pair_iterator_bench_thrust(cudf::column_view &col, - rmm::device_uvector> &result) +void pair_iterator_bench_thrust(cudf::column_view& col, + rmm::device_uvector>& result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -187,7 +187,7 @@ void pair_iterator_bench_thrust(cudf::column_view &col, } template -void BM_pair_iterator(benchmark::State &state) +void BM_pair_iterator(benchmark::State& state) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join_benchmark.cu new file mode 100644 index 00000000000..4a655e29f74 --- /dev/null +++ b/cpp/benchmarks/join/conditional_join_benchmark.cu @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "generate_input_tables.cuh" + +template +class ConditionalJoin : public cudf::benchmark { +}; + +template +static void BM_join(benchmark::State& state, Join JoinFunc) +{ + const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; + const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; + const cudf::size_type rand_max_val{build_table_size * 2}; + const double selectivity = 0.3; + const bool is_build_table_key_unique = true; + + // Generate build and probe tables + cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); + auto build_random_null_mask = [&rand_gen](int size) { + if (Nullable) { + // roughly 25% nulls + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); + return cudf::test::detail::make_null_mask(validity, validity + size); + } else { + return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); + } + }; + + std::unique_ptr build_key_column = [&]() { + return Nullable ? 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size, + build_random_null_mask(build_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size); + }(); + std::unique_ptr probe_key_column = [&]() { + return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size, + build_random_null_mask(probe_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size); + }(); + + generate_input_tables( + build_key_column->mutable_view().data(), + build_table_size, + probe_key_column->mutable_view().data(), + probe_table_size, + selectivity, + rand_max_val, + is_build_table_key_unique); + + auto payload_data_it = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper build_payload_column( + payload_data_it, payload_data_it + build_table_size); + + cudf::test::fixed_width_column_wrapper probe_payload_column( + payload_data_it, payload_data_it + probe_table_size); + + CHECK_CUDA(0); + + cudf::table_view build_table({build_key_column->view(), build_payload_column}); + cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); + + // Benchmark the inner join operation + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + // Common column references. + const auto col_ref_left_0 = cudf::ast::column_reference(0); + const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + + auto result = + JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); + } +} + +#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, int32_t, false); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, int64_t, false); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, int32_t, true); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, int64_t, true); + +#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, int32_t, false); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, int64_t, false); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, int32_t, true); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, int64_t, true); + +#define 
CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_full_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, int32_t, false); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, int64_t, false); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, int32_t, true); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, int64_t, true); + +#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_anti_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, + int32_t, + int32_t, + false); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, + int64_t, + int64_t, + false); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, + int32_t, + int32_t, + true); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, + int64_t, + int64_t, + true); + +#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_semi_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, + int32_t, + int32_t, + false); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, + int64_t, + int64_t, + false); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, + int32_t, + int32_t, + true); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, + int64_t, + int64_t, + true); + +// inner join ----------------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + // TODO: The below benchmark is slow, but can be useful to validate that the + // code works for large data sets. This benchmark was used to compare to the + // otherwise equivalent nullable benchmark below, which has memory errors for + // sufficiently large data sets. 
+ //->Args({1'000'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// left join ----------------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// full join ----------------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// left anti-join ------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + 
->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// left semi-join ------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 285a9241a26..d7f64716e58 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -141,7 +141,7 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, * (e.g. device memory, zero copy memory or unified memory). Each value in the build table * will be from [0,rand_max] and if uniq_build_tbl_keys is true it is ensured that each value * will be uniq in the build table. Each value in the probe table will be also in the build - * table with a propability of selectivity and a random number from + * table with a probability of selectivity and a random number from * [0,rand_max] \setminus \{build_tbl\} otherwise. * * @param[out] build_tbl The build table to generate. Usually the smaller table used to @@ -150,7 +150,7 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, * @param[out] probe_tbl The probe table to generate. Usually the larger table used to * probe into the hash table created from the build table. * @param[in] build_tbl_size number of keys in the build table - * @param[in] selectivity propability with which an element of the probe table is + * @param[in] selectivity probability with which an element of the probe table is * present in the build table. * @param[in] rand_max maximum random number to generate. I.e. random numbers are * integers from [0,rand_max]. @@ -169,7 +169,7 @@ void generate_input_tables(key_type* const build_tbl, // expense of not being that accurate with applying the selectivity an especially more memory // efficient implementations would be to partition the random numbers into two intervals and then // let one table choose random numbers from only one interval and the other only select with - // selectivity propability from the same interval and from the other in the other cases. + // selective probability from the same interval and from the other in the other cases. 
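// Worked example (illustration only; the 0.3 selectivity and table sizes are the
// defaults used by the conditional join benchmarks above): with selectivity = 0.3,
// build_tbl_size = 100'000 and probe_tbl_size = 1'000'000, about
// 0.3 * 1'000'000 = 300'000 probe keys are expected to come from the build
// table's key set, and the remaining ~700'000 are drawn from [0, rand_max]
// excluding those keys.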
static_assert(std::is_signed::value, "key_type needs to be signed for lottery to work"); diff --git a/cpp/benchmarks/reduction/anyall_benchmark.cpp b/cpp/benchmarks/reduction/anyall_benchmark.cpp index 97d66585f8c..3dcb433ec52 100644 --- a/cpp/benchmarks/reduction/anyall_benchmark.cpp +++ b/cpp/benchmarks/reduction/anyall_benchmark.cpp @@ -48,7 +48,7 @@ void BM_reduction_anyall(benchmark::State& state, std::unique_ptr co } #define concat(a, b, c) a##b##c -#define get_agg(op) concat(cudf::make_, op, _aggregation()) +#define get_agg(op) concat(cudf::make_, op, _aggregation()) // TYPE, OP #define RBM_BENCHMARK_DEFINE(name, type, aggregation) \ diff --git a/cpp/benchmarks/search/search_benchmark.cpp b/cpp/benchmarks/search/search_benchmark.cpp index 7fb196fb500..c3529c7e79c 100644 --- a/cpp/benchmarks/search/search_benchmark.cpp +++ b/cpp/benchmarks/search/search_benchmark.cpp @@ -131,7 +131,8 @@ BENCHMARK_DEFINE_F(Search, Table)(::benchmark::State& state) { BM_table(state); static void CustomArguments(benchmark::internal::Benchmark* b) { for (int num_cols = 1; num_cols <= 10; num_cols *= 2) - for (int col_size = 1000; col_size <= 100000000; col_size *= 10) b->Args({num_cols, col_size}); + for (int col_size = 1000; col_size <= 100000000; col_size *= 10) + b->Args({num_cols, col_size}); } BENCHMARK_REGISTER_F(Search, Table) diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp index 5cd2278ca14..7246d113ade 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp @@ -35,13 +35,15 @@ constexpr cudf::size_type fifty_percent = 50; void percent_range(benchmark::internal::Benchmark* b) { b->Unit(benchmark::kMillisecond); - for (int percent = 0; percent <= 100; percent += 10) b->Args({hundredM, percent}); + for (int percent = 0; percent <= 100; percent += 10) + b->Args({hundredM, percent}); } void size_range(benchmark::internal::Benchmark* b) { b->Unit(benchmark::kMillisecond); - for (int size = tenK; size <= hundredM; size *= 10) b->Args({size, fifty_percent}); + for (int size = tenK; size <= hundredM; size *= 10) + b->Args({size, fifty_percent}); } template @@ -64,9 +66,9 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) cudf::size_type const column_size_out = fraction * column_size; int64_t const mask_size = sizeof(bool) * column_size + cudf::bitmask_allocation_size_bytes(column_size); - int64_t const validity_bytes_in = (fraction >= 1.0f / 32) - ? cudf::bitmask_allocation_size_bytes(column_size) - : 4 * column_size_out; + int64_t const validity_bytes_in = (fraction >= 1.0f / 32) + ? 
cudf::bitmask_allocation_size_bytes(column_size) + : 4 * column_size_out; int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(column_size_out); int64_t const column_bytes_out = sizeof(T) * column_size_out; int64_t const column_bytes_in = column_bytes_out; // we only read unmasked inputs diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 16bae725621..8039d7d065f 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -50,7 +50,7 @@ void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) } #define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op +#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op // TYPE, OP #define RBM_BENCHMARK_DEFINE(name, type, keep) \ diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract_benchmark.cpp index aa1e59a22bf..161e30c6f25 100644 --- a/cpp/benchmarks/string/extract_benchmark.cpp +++ b/cpp/benchmarks/string/extract_benchmark.cpp @@ -48,7 +48,9 @@ static void BM_extract(benchmark::State& state, int groups) }); std::string pattern; - while (static_cast(pattern.size()) < groups) { pattern += "(\\d+) "; } + while (static_cast(pattern.size()) < groups) { + pattern += "(\\d+) "; + } std::uniform_int_distribution distribution(0, samples.size() - 1); auto elements = cudf::detail::make_counting_transform_iterator( diff --git a/cpp/benchmarks/text/replace_benchmark.cpp b/cpp/benchmarks/text/replace_benchmark.cpp index 8f6704ab1af..0a0e6a1667c 100644 --- a/cpp/benchmarks/text/replace_benchmark.cpp +++ b/cpp/benchmarks/text/replace_benchmark.cpp @@ -41,7 +41,8 @@ static void BM_replace(benchmark::State& state) std::default_random_engine generator; std::uniform_int_distribution tokens_dist(0, words.size() - 1); std::string row; // build a row of random tokens - while (static_cast(row.size()) < n_length) row += words[tokens_dist(generator)]; + while (static_cast(row.size()) < n_length) + row += words[tokens_dist(generator)]; std::uniform_int_distribution position_dist(0, 16); diff --git a/cpp/benchmarks/text/subword_benchmark.cpp b/cpp/benchmarks/text/subword_benchmark.cpp index 3670fa7c9a7..2406ddd39ae 100644 --- a/cpp/benchmarks/text/subword_benchmark.cpp +++ b/cpp/benchmarks/text/subword_benchmark.cpp @@ -37,7 +37,8 @@ static std::string create_hash_vocab_file() std::vector> coefficients(23, {65559, 0}); std::ofstream outfile(hash_file, std::ofstream::out); outfile << "1\n0\n" << coefficients.size() << "\n"; - for (auto c : coefficients) outfile << c.first << " " << c.second << "\n"; + for (auto c : coefficients) + outfile << c.first << " " << c.second << "\n"; std::vector hash_table(23, 0); outfile << hash_table.size() << "\n"; hash_table[0] = 3015668L; @@ -45,7 +46,8 @@ static std::string create_hash_vocab_file() hash_table[5] = 6358029; hash_table[16] = 451412625363L; hash_table[20] = 6206321707968235495L; - for (auto h : hash_table) outfile << h << "\n"; + for (auto h : hash_table) + outfile << h << "\n"; outfile << "100\n101\n102\n\n"; return hash_file; } diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index b09a7911595..8e51bcca63d 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -64,7 
+64,9 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_ using F = Functor; cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; while (index < n_rows) { - for (int c = 0; c < n_cols; c++) { A[c][index] = F::f(A[c][index]); } + for (int c = 0; c < n_cols; c++) { + A[c][index] = F::f(A[c][index]); + } index += blockDim.x * gridDim.x; } } diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 0eee5abd2f3..8cef3e8b9d0 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -14,11 +14,10 @@ # limitations under the License. #============================================================================= -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON ENABLE_PARQUET) set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) - set(ARROW_BUILD_S3 OFF) set(CPMAddOrFindPackage CPMFindPackage) if(NOT ARROW_ARMV8_ARCH) @@ -36,10 +35,23 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) set(CPMAddOrFindPackage CPMAddPackage) endif() - if(ENABLE_S3) - set(ARROW_BUILD_S3 ON) + set(ARROW_PYTHON_OPTIONS "") + if(ENABLE_PYTHON) + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") + # Arrow's logic to find Thrift is busted, so we have to build it from + # source. Why can't we use `THRIFT_SOURCE BUNDLED` you might ask? + # Because that's _also_ busted. The only thing that seems to work is to set + # _all_ dependencies to bundled, then optionally un-set BOOST_SOURCE to + # SYSTEM. + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED") endif() + # Set this so Arrow correctly finds the CUDA toolkit when the build machine + # does not have the CUDA driver installed. This must be an env var. + set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") + cmake_language(CALL ${CPMAddOrFindPackage} NAME Arrow VERSION ${VERSION} @@ -55,7 +67,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ARROW_BUILD_S3}" + "ARROW_S3 ${ENABLE_S3}" + # e.g. needed by blazingsql-io + "ARROW_PARQUET ${ENABLE_PARQUET}" + ${ARROW_PYTHON_OPTIONS} # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" @@ -98,13 +113,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util") file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu") + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet") + endif() ### # This shouldn't be necessary! # # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` # and `arrow_shared` targets in FindArrow and FindArrowCUDA respectively, # so for static source-builds, we have to do it after-the-fact. - # + # # This only works because we know exactly which components we're using. # Don't forget to update this list if we add more!
### @@ -127,6 +146,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) endfunction() -set(CUDF_VERSION_Arrow 1.0.1) +set(CUDF_VERSION_Arrow 4.0.1) -find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3}) +find_and_configure_arrow( + ${CUDF_VERSION_Arrow} + ${CUDF_USE_ARROW_STATIC} + ${CUDF_ENABLE_ARROW_S3} + ${CUDF_ENABLE_ARROW_PYTHON} + ${CUDF_ENABLE_ARROW_PARQUET} +) diff --git a/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake b/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake new file mode 100644 index 00000000000..09ceffb284f --- /dev/null +++ b/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake @@ -0,0 +1,34 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +# NVBench doesn't have a public release yet + +function(find_and_configure_nvbench) + + if(TARGET nvbench::main) + return() + endif() + + CPMFindPackage(NAME nvbench + GIT_REPOSITORY https://github.com/NVIDIA/nvbench.git + GIT_TAG main + GIT_SHALLOW TRUE + OPTIONS "NVBench_ENABLE_EXAMPLES OFF" + "NVBench_ENABLE_TESTING OFF") + +endfunction() + +find_and_configure_nvbench() diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 8ec111acdb2..9ec64060847 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -470,7 +470,7 @@ libcudf, and you should not use it in new code in libcudf without careful consid use `rmm::device_uvector` along with the utility factories in `device_factories.hpp`. These utilities enable creation of `uvector`s from host-side vectors, or creating zero-initialized `uvector`s, so that they are as convenient to use as `device_vector`. Avoiding `device_vector` has -a number of benefits, as described in the folling section on `rmm::device_uvector`. +a number of benefits, as described in the following section on `rmm::device_uvector`. #### `rmm::device_uvector` diff --git a/cpp/docs/TESTING.md b/cpp/docs/TESTING.md index 2c7b62b8b6d..3c741b5d4e7 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/docs/TESTING.md @@ -67,7 +67,7 @@ not necessary for your test fixtures to inherit from it. 
Example: ```c++ -class MyTestFiture : public cudf::test::BaseFixture {...}; +class MyTestFixture : public cudf::test::BaseFixture {...}; ``` ## Typed Tests diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 5fab284d506..a2f59de54db 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -67,8 +67,9 @@ class aggregation { ALL, ///< all reduction SUM_OF_SQUARES, ///< sum of squares reduction MEAN, ///< arithmetic mean reduction - VARIANCE, ///< groupwise variance - STD, ///< groupwise standard deviation + M2, ///< sum of squares of differences from the mean + VARIANCE, ///< variance + STD, ///< standard deviation MEDIAN, ///< median reduction QUANTILE, ///< compute specified quantile(s) ARGMAX, ///< Index of max element @@ -78,12 +79,13 @@ class aggregation { ROW_NUMBER, ///< get row-number of current index (relative to rolling window) COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries - MERGE_LISTS, ///< merge multiple lists values into one list - MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row PTX, ///< PTX UDF based reduction - CUDA ///< CUDA UDF based reduction + CUDA, ///< CUDA UDF based reduction + MERGE_LISTS, ///< merge multiple lists values into one list + MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries + MERGE_M2 ///< merge partial values of M2 aggregation }; aggregation() = delete; @@ -159,6 +161,20 @@ std::unique_ptr make_sum_of_squares_aggregation(); template std::unique_ptr make_mean_aggregation(); +/** + * @brief Factory to create an M2 aggregation + * + * An M2 aggregation is the sum of squares of differences from the mean. That is: + * `M2 = SUM((x - MEAN) * (x - MEAN))`. + * + * This aggregation produces the intermediate values that are used to compute variance and standard + * deviation across multiple discrete sets. See + * `https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm` for more + * detail. + */ +template +std::unique_ptr make_m2_aggregation(); + /** * @brief Factory to create a VARIANCE aggregation * @@ -271,11 +287,33 @@ std::unique_ptr make_collect_set_aggregation(null_policy null_handling = n null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL); +/// Factory to create a LAG aggregation +template +std::unique_ptr make_lag_aggregation(size_type offset); + +/// Factory to create a LEAD aggregation +template +std::unique_ptr make_lead_aggregation(size_type offset); + +/** + * @brief Factory to create an aggregation based on UDF for PTX or CUDA + * + * @param[in] type: either udf_type::PTX or udf_type::CUDA + * @param[in] user_defined_aggregator A string containing the aggregator code + * @param[in] output_type expected output type + * + * @return aggregation unique pointer housing user_defined_aggregator string. + */ +template +std::unique_ptr make_udf_aggregation(udf_type type, + std::string const& user_defined_aggregator, + data_type output_type); + /** * @brief Factory to create a MERGE_LISTS aggregation. * * Given a lists column, this aggregation merges all the lists corresponding to the same key value - into one list.
It is designed specifically to merge the partial results of multiple (distributed) * groupby `COLLECT_LIST` aggregations into a final `COLLECT_LIST` result. As such, it requires the * input lists column to be non-nullable (the child column containing list entries is not subjected * to this requirement). @@ -290,7 +328,7 @@ std::unique_ptr make_merge_lists_aggregation(); * value into one list, then it drops all the duplicate entries in each lists, producing a lists * column containing non-repeated entries. * - * This aggregation is designed specificly to merge the partial results of multiple (distributed) + * This aggregation is designed specifically to merge the partial results of multiple (distributed) * groupby `COLLECT_LIST` or `COLLECT_SET` aggregations into a final `COLLECT_SET` result. As such, * it requires the input lists column to be non-nullable (the child column containing list entries * is not subjected to this requirement). @@ -308,27 +346,20 @@ template std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL); -/// Factory to create a LAG aggregation -template -std::unique_ptr make_lag_aggregation(size_type offset); - -/// Factory to create a LEAD aggregation -template -std::unique_ptr make_lead_aggregation(size_type offset); - /** - * @brief Factory to create an aggregation base on UDF for PTX or CUDA + * @brief Factory to create a MERGE_M2 aggregation * - * @param[in] type: either udf_type::PTX or udf_type::CUDA - * @param[in] user_defined_aggregator A string containing the aggregator code - * @param[in] output_type expected output type + * Merges the results of `M2` aggregations on independent sets into a new `M2` value equivalent to + * if a single `M2` aggregation was done across all of the sets at once. This aggregation is only + * valid on structs whose members are the result of the `COUNT_VALID`, `MEAN`, and `M2` aggregations + * on the same sets. The output of this aggregation is a struct containing the merged `COUNT_VALID`, + * `MEAN`, and `M2` aggregations. * - * @return aggregation unique pointer housing user_defined_aggregator string. + * The input `M2` aggregation values are expected to be all non-negative numbers, since they + * were output from `M2` aggregation. */ template -std::unique_ptr make_udf_aggregation(udf_type type, - std::string const& user_defined_aggregator, - data_type output_type); +std::unique_ptr make_merge_m2_aggregation(); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp index 166a0408703..67474e08877 100644 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ b/cpp/include/cudf/ast/detail/linearizer.hpp @@ -103,10 +103,24 @@ class linearizer { /** * @brief Construct a new linearizer object * + * @param expr The expression to create an evaluable linearizer for. + * @param left The left table used for evaluating the abstract syntax tree. + * @param right The right table used for evaluating the abstract syntax tree. + */ + linearizer(detail::node const& expr, cudf::table_view left, cudf::table_view right) + : _left(left), _right(right), _node_count(0), _intermediate_counter() + { + expr.accept(*this); + } + + /** + * @brief Construct a new linearizer object + * + * @param expr The expression to create an evaluable linearizer for. * @param table The table used for evaluating the abstract syntax tree. 
*/ linearizer(detail::node const& expr, cudf::table_view table) - : _table(table), _node_count(0), _intermediate_counter() + : _left(table), _right(table), _node_count(0), _intermediate_counter() { expr.accept(*this); } @@ -217,7 +231,8 @@ class linearizer { cudf::size_type add_data_reference(detail::device_data_reference data_ref); // State information about the "linearized" GPU execution plan - cudf::table_view _table; + cudf::table_view const& _left; + cudf::table_view const& _right; cudf::size_type _node_count; intermediate_counter _intermediate_counter; std::vector _data_references; diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index f69927a3601..e56b4fb2281 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -31,6 +31,8 @@ #include +#include + #include #include @@ -40,132 +42,375 @@ namespace ast { namespace detail { -// Forward declaration -struct row_evaluator; +// Type trait for wrapping nullable types in a thrust::optional. Non-nullable +// types are returned as is. +template +struct possibly_null_value; -struct row_output { - public: - __device__ row_output(row_evaluator const& evaluator) : evaluator(evaluator) {} +template +struct possibly_null_value { + using type = thrust::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + +// Type used for intermediate storage in expression evaluation. +template +using IntermediateDataType = possibly_null_value_t; +/** + * @brief A container for capturing the output of an evaluated expression. + * + * This class is designed to be passed by reference as the first argument to + * expression_evaluator::evaluate. The API is designed such that template + * specializations for specific output types will be able to customize setting + * behavior if necessary. The class leverages CRTP to define a suitable interface + * for the `expression_evaluator` at compile-time and enforce this API on its + * subclasses to get around the lack of device-side polymorphism. + * + * @tparam Subclass The subclass to dispatch methods to. + * @tparam T The underlying data type. + * @tparam has_nulls Whether or not the result data is nullable. + */ +template +struct expression_result { /** - * @brief Resolves an output data reference and assigns result value. - * - * Only output columns (COLUMN) and intermediates (INTERMEDIATE) are supported as output reference - * types. Intermediates must be of fixed width less than or equal to sizeof(std::int64_t). This - * requirement on intermediates is enforced by the linearizer. - * - * @tparam Element Type of result element. - * @param device_data_reference Data reference to resolve. - * @param row_index Row index of data column. - * @param result Value to assign to output. + * Helper function to get the subclass type to dispatch methods to. 
*/ - template ())> - __device__ void resolve_output(detail::device_data_reference device_data_reference, - cudf::size_type row_index, - Element result) const; - // Definition below after row_evaluator is a complete type - - template ())> - __device__ void resolve_output(detail::device_data_reference device_data_reference, - cudf::size_type row_index, - Element result) const + Subclass& subclass() { return static_cast(*this); } + Subclass const& subclass() const { return static_cast(*this); } + + // TODO: The index is ignored by the value subclass, but is included in this + // signature because it is required by the implementation in the template + // specialization for column views. It would be nice to clean this up, see + // the related TODO below. Note that storing the index in the class on + // construction (which would result in a cleaner delineation of the API for + // the derived types) results in a significant performance penalty because + // the index is pushed down the memory hierarchy by the time it needs to be + // used, whereas passing it as a parameter keeps it in registers for fast + // access at the point where indexing occurs. + template + __device__ void set_value(cudf::size_type index, possibly_null_value_t result) { - cudf_assert(false && "Invalid type in resolve_output."); + subclass()->set_value(); } - private: - row_evaluator const& evaluator; + __device__ bool is_valid() const { subclass()->is_valid(); } + + __device__ T value() const { subclass()->value(); } }; -template -struct unary_row_output : public row_output { - __device__ unary_row_output(row_evaluator const& evaluator) : row_output(evaluator) {} +/** + * @brief A container for capturing the output of an evaluated expression in a scalar. + * + * This subclass of `expression_result` functions as an owning container of a + * (possibly nullable) scalar type that can be written to by the + * expression_evaluator. The data (and its validity) can then be accessed. + * + * @tparam T The underlying data type. + * @tparam has_nulls Whether or not the result data is nullable. + */ +template +struct value_expression_result + : public expression_result, T, has_nulls> { + __device__ value_expression_result() {} - template < - ast_operator op, - std::enable_if_t, Input>>* = nullptr> - __device__ void operator()(cudf::size_type row_index, - Input input, - detail::device_data_reference output) const + template + __device__ void set_value(cudf::size_type index, possibly_null_value_t result) { - using OperatorFunctor = detail::operator_functor; - using Out = cuda::std::invoke_result_t; - resolve_output(output, row_index, OperatorFunctor{}(input)); + if constexpr (std::is_same_v) { + _obj = result; + } else { + cudf_assert(false && "Output type does not match container type."); + } } - template < - ast_operator op, - std::enable_if_t, Input>>* = nullptr> - __device__ void operator()(cudf::size_type row_index, - Input input, - detail::device_data_reference output) const + /** + * @brief Returns true if the underlying data is valid and false otherwise. + */ + __device__ bool is_valid() const { - cudf_assert(false && "Invalid unary dispatch operator for the provided input."); + if constexpr (has_nulls) { return _obj.has_value(); } + return true; } + + /** + * @brief Returns the underlying data. + * + * @throws thrust::bad_optional_access if the underlying data is not valid. + */ + __device__ T value() const + { + // Using two separate constexprs silences compiler warnings, whereas an + // if/else does not. 
An unconditional return is not ignored by the compiler + // when has_nulls is true and therefore raises a compiler error. + if constexpr (has_nulls) { return _obj.value(); } + if constexpr (!has_nulls) { return _obj; } + } + + possibly_null_value_t + _obj; ///< The underlying data value, or a nullable version of it. }; -template -struct binary_row_output : public row_output { - __device__ binary_row_output(row_evaluator const& evaluator) : row_output(evaluator) {} - - template < - ast_operator op, - std::enable_if_t, LHS, RHS>>* = nullptr> - __device__ void operator()(cudf::size_type row_index, - LHS lhs, - RHS rhs, - detail::device_data_reference output) const +// TODO: The below implementation significantly differs from the default +// implementation above due to the non-owning nature of the container and the +// usage of the index. It would be ideal to unify these further if possible. + +/** + * @brief A container for capturing the output of an evaluated expression in a column. + * + * This subclass of `expression_result` functions as a non-owning container + * that transparently passes calls through to an underlying mutable view to a + * column. Not all methods are implemented + * + * @tparam has_nulls Whether or not the result data is nullable. + */ +template +struct mutable_column_expression_result + : public expression_result, + mutable_column_device_view, + has_nulls> { + __device__ mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj) {} + + template + __device__ void set_value(cudf::size_type index, possibly_null_value_t result) + { + if constexpr (has_nulls) { + if (result.has_value()) { + _obj.template element(index) = *result; + _obj.set_valid(index); + } else { + _obj.set_null(index); + } + } else { + _obj.template element(index) = result; + } + } + + /** + * @brief Not implemented for this specialization. + */ + __device__ bool is_valid() const { - using OperatorFunctor = detail::operator_functor; - using Out = cuda::std::invoke_result_t; - resolve_output(output, row_index, OperatorFunctor{}(lhs, rhs)); + // Not implemented since it would require modifying the API in the parent class to accept an + // index. + cudf_assert(false && "This method is not implemented."); } - template , LHS, RHS>>* = - nullptr> - __device__ void operator()(cudf::size_type row_index, - LHS lhs, - RHS rhs, - detail::device_data_reference output) const + /** + * @brief Not implemented for this specialization. + */ + __device__ mutable_column_device_view value() const { - cudf_assert(false && "Invalid binary dispatch operator for the provided input."); + // Not implemented since it would require modifying the API in the parent class to accept an + // index. + cudf_assert(false && "This method is not implemented."); } + + mutable_column_device_view& _obj; ///< The column to which the data is written. }; /** - * @brief An expression evaluator owned by a single thread operating on rows of a table. + * @brief A container of all device data required to evaluate an expression on tables. + * + * This struct should never be instantiated directly. It is created by the + * `ast_plan` on construction, and the resulting member is publicly accessible + * for passing to kernels for constructing an `expression_evaluator`. * - * This class is designed for n-ary transform evaluation. Currently this class assumes that there's - * only one relevant "row index" in its methods, which corresponds to a row in a single input table - * and the same row index in an output column. 
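The `expression_result` documentation above leans on CRTP to recover polymorphism without virtual dispatch, which device code cannot use efficiently. A stripped-down host-only sketch of that pattern (illustrative names, a single data member, and none of the real class's template machinery) may make the mechanism easier to follow:

```cpp
#include <iostream>

// Base class dispatches to the derived class at compile time via static_cast;
// no vtable or virtual calls are involved.
template <typename Subclass, typename T>
struct expression_result_sketch {
  void set_value(T v) { static_cast<Subclass&>(*this).set_value_impl(v); }
  T value() const { return static_cast<Subclass const&>(*this).value_impl(); }
};

// A concrete "owning scalar" result, analogous in spirit to the
// value_expression_result specialization in the surrounding diff.
struct scalar_result_sketch : expression_result_sketch<scalar_result_sketch, int> {
  void set_value_impl(int v) { _v = v; }
  int value_impl() const { return _v; }
  int _v{};
};

int main()
{
  scalar_result_sketch r;
  r.set_value(42);
  std::cout << r.value() << "\n";  // prints 42
}
```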
*/ -struct row_evaluator { - friend struct row_output; - template - friend struct unary_row_output; - template - friend struct binary_row_output; +struct device_ast_plan { + device_span data_references; + device_span literals; + device_span operators; + device_span operator_source_indices; + cudf::size_type num_intermediates; + int shmem_per_thread; +}; + +/** + * @brief Preprocessor for an expression acting on tables to generate data suitable for AST + * expression evaluation on the GPU. + * + * On construction, an AST plan creates a single "packed" host buffer of all + * data arrays that will be necessary to evaluate an expression on a pair of + * tables. This data is copied to a single contiguous device buffer, and + * pointers are generated to the individual components. Because the plan tends + * to be small, this is the most efficient approach for low latency. All the + * data required on the GPU can be accessed via the convenient `dev_plan` + * member struct, which can be used to construct an `expression_evaluator` on + * the device. + * + * Note that the resulting device data cannot be used once this class goes out of scope. + */ +struct ast_plan { + /** + * @brief Construct an AST plan for an expression operating on two tables. + * + * @param expr The expression for which to construct a plan. + * @param left The left table on which the expression acts. + * @param right The right table on which the expression acts. + * @param has_nulls Boolean indicator of whether or not the data contains nulls. + * @param stream Stream view on which to allocate resources and queue execution. + * @param mr Device memory resource used to allocate the returned column's device. + */ + ast_plan(detail::node const& expr, + cudf::table_view left, + cudf::table_view right, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _linearizer(expr, left, right) + { + std::vector sizes; + std::vector data_pointers; + + extract_size_and_pointer(_linearizer.data_references(), sizes, data_pointers); + extract_size_and_pointer(_linearizer.literals(), sizes, data_pointers); + extract_size_and_pointer(_linearizer.operators(), sizes, data_pointers); + extract_size_and_pointer(_linearizer.operator_source_indices(), sizes, data_pointers); + + // Create device buffer + auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::make_unique(buffer_size); + for (unsigned int i = 0; i < data_pointers.size(); ++i) { + std::memcpy(h_data_buffer.get() + buffer_offsets[i], data_pointers[i], sizes[i]); + } + + _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); + stream.synchronize(); + + // Create device pointers to components of plan + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + dev_plan.data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + _linearizer.data_references().size()); + dev_plan.literals = device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + _linearizer.literals().size()); + dev_plan.operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + _linearizer.operators().size()); + dev_plan.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _linearizer.operator_source_indices().size()); + 
dev_plan.num_intermediates = _linearizer.intermediate_count(); + dev_plan.shmem_per_thread = static_cast( + (has_nulls ? sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * + dev_plan.num_intermediates); + } + + /** + * @brief Construct an AST plan for an expression operating on one table. + * + * @param expr The expression for which to construct a plan. + * @param table The table on which the expression acts. + * @param has_nulls Boolean indicator of whether or not the data contains nulls. + * @param stream Stream view on which to allocate resources and queue execution. + * @param mr Device memory resource used to allocate the returned column's device. + */ + ast_plan(detail::node const& expr, + cudf::table_view table, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : ast_plan(expr, table, table, has_nulls, stream, mr) + { + } + + cudf::data_type output_type() const { return _linearizer.root_data_type(); } + + device_ast_plan + dev_plan; ///< The collection of data required to evaluate the expression on the device. + + private: + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param[in] v The `std::vector` containing components (operators, literals, etc). + * @param[in,out] sizes The `std::vector` containing the size of each data buffer. + * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + */ + template + void extract_size_and_pointer(std::vector const& v, + std::vector& sizes, + std::vector& data_pointers) + { + auto const data_size = sizeof(T) * v.size(); + sizes.push_back(data_size); + data_pointers.push_back(v.data()); + } + + rmm::device_buffer + _device_data_buffer; ///< The device-side data buffer containing the plan information, which is + ///< owned by this class and persists until it is destroyed. + linearizer const _linearizer; ///< The linearizer created from the provided expression that is + ///< used to construct device-side operators and references. +}; + +/** + * @brief The principal object for evaluating AST expressions on device. + * + * This class is designed for n-ary transform evaluation. It operates on two + * tables. + */ +template +struct expression_evaluator { public: /** - * @brief Construct a row evaluator. + * @brief Construct an expression evaluator acting on two tables. + * + * @param left View of the left table view used for evaluation. + * @param right View of the right table view used for evaluation. + * @param plan The collection of device references representing the expression to evaluate. + * @param thread_intermediate_storage Pointer to this thread's portion of shared memory for + * storing intermediates. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + + */ + __device__ expression_evaluator(table_device_view const& left, + table_device_view const& right, + device_ast_plan const& plan, + IntermediateDataType* thread_intermediate_storage, + null_equality compare_nulls = null_equality::EQUAL) + : left(left), + right(right), + plan(plan), + thread_intermediate_storage(thread_intermediate_storage), + compare_nulls(compare_nulls) + { + } + + /** + * @brief Construct an expression evaluator acting on one table. * - * @param table The table device view used for evaluation. - * @param literals Array of literal values used for evaluation. + * @param table View of the table view used for evaluation. 
+ * @param plan The collection of device references representing the expression to evaluate. * @param thread_intermediate_storage Pointer to this thread's portion of shared memory for * storing intermediates. - * @param output_column The output column where results are stored. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. */ - __device__ row_evaluator( - table_device_view const& table, - device_span literals, - std::int64_t* thread_intermediate_storage, - mutable_column_device_view* output_column) - : table(table), - literals(literals), + __device__ expression_evaluator(table_device_view const& table, + device_ast_plan const& plan, + IntermediateDataType* thread_intermediate_storage, + null_equality compare_nulls = null_equality::EQUAL) + : left(table), + right(table), + plan(plan), thread_intermediate_storage(thread_intermediate_storage), - output_column(output_column) + compare_nulls(compare_nulls) { } @@ -177,241 +422,437 @@ struct row_evaluator { * sizeof(std::int64_t). This requirement on intermediates is enforced by the linearizer. * * @tparam Element Type of element to return. + * @tparam has_nulls Whether or not the result data is nullable. * @param device_data_reference Data reference to resolve. * @param row_index Row index of data column. - * @return Element + * @return Element The type- and null-resolved data. */ template ())> - __device__ Element resolve_input(detail::device_data_reference device_data_reference, - cudf::size_type row_index) const + __device__ possibly_null_value_t resolve_input( + detail::device_data_reference device_data_reference, cudf::size_type row_index) const { auto const data_index = device_data_reference.data_index; auto const ref_type = device_data_reference.reference_type; + // TODO: Everywhere in the code assumes that the table reference is either + // left or right. Should we error-check somewhere to prevent + // table_reference::OUTPUT from being specified? + auto const& table = device_data_reference.table_source == table_reference::LEFT ? left : right; + using ReturnType = possibly_null_value_t; if (ref_type == detail::device_data_reference_type::COLUMN) { - return table.column(data_index).element(row_index); + // If we have nullable data, return an empty nullable type with no value if the data is null. + if constexpr (has_nulls) { + return table.column(data_index).is_valid(row_index) + ? ReturnType(table.column(data_index).element(row_index)) + : ReturnType(); + + } else { + return ReturnType(table.column(data_index).element(row_index)); + } } else if (ref_type == detail::device_data_reference_type::LITERAL) { - return literals[data_index].value(); + return ReturnType(plan.literals[data_index].value()); } else { // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE // Using memcpy instead of reinterpret_cast for safe type aliasing // Using a temporary variable ensures that the compiler knows the result is aligned - std::int64_t intermediate = thread_intermediate_storage[data_index]; - Element tmp; - memcpy(&tmp, &intermediate, sizeof(Element)); + IntermediateDataType intermediate = thread_intermediate_storage[data_index]; + ReturnType tmp; + memcpy(&tmp, &intermediate, sizeof(ReturnType)); return tmp; } + // Unreachable return used to silence compiler warnings. 
+ return {}; } template ())> - __device__ Element resolve_input(detail::device_data_reference device_data_reference, - cudf::size_type row_index) const + __device__ possibly_null_value_t resolve_input( + detail::device_data_reference device_data_reference, cudf::size_type row_index) const { cudf_assert(false && "Unsupported type in resolve_input."); + // Unreachable return used to silence compiler warnings. return {}; } /** * @brief Callable to perform a unary operation. * - * @tparam OperatorFunctor Functor that performs desired operation when `operator()` is called. * @tparam Input Type of input value. - * @param row_index Row index of data column(s). + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param input_row_index The row to pull the data from the input table. * @param input Input data reference. * @param output Output data reference. + * @param output_row_index The row in the output to insert the result. + * @param op The operator to act with. */ - template - __device__ void operator()(cudf::size_type row_index, - detail::device_data_reference input, - detail::device_data_reference output, - ast_operator op) const + template + __device__ void operator()(OutputType& output_object, + const cudf::size_type input_row_index, + const detail::device_data_reference input, + const detail::device_data_reference output, + const cudf::size_type output_row_index, + const ast_operator op) const { - auto const typed_input = resolve_input(input, row_index); - ast_operator_dispatcher(op, unary_row_output(*this), row_index, typed_input, output); + auto const typed_input = resolve_input(input, input_row_index); + ast_operator_dispatcher(op, + unary_expression_output_handler(*this), + output_object, + output_row_index, + typed_input, + output); } /** - * @brief Callable to perform a binary operation. + * @brief Callable to perform a binary operation. + * + * @tparam LHS Type of the left input value. + * @tparam RHS Type of the right input value. + * @tparam OutputType The container type that data will be inserted into. * - * @tparam OperatorFunctor Functor that performs desired operation when `operator()` is called. - * @tparam LHS Type of left input value. - * @tparam RHS Type of right input value. - * @param row_index Row index of data column(s). + * @param output_object The container that data will be inserted into. + * @param left_row_index The row to pull the data from the left table. + * @param right_row_index The row to pull the data from the right table. * @param lhs Left input data reference. * @param rhs Right input data reference. * @param output Output data reference. + * @param output_row_index The row in the output to insert the result. + * @param op The operator to act with.
*/ - template - __device__ void operator()(cudf::size_type row_index, - detail::device_data_reference lhs, - detail::device_data_reference rhs, - detail::device_data_reference output, - ast_operator op) const + template + __device__ void operator()(OutputType& output_object, + const cudf::size_type left_row_index, + const cudf::size_type right_row_index, + const detail::device_data_reference lhs, + const detail::device_data_reference rhs, + const detail::device_data_reference output, + const cudf::size_type output_row_index, + const ast_operator op) const { - auto const typed_lhs = resolve_input(lhs, row_index); - auto const typed_rhs = resolve_input(rhs, row_index); - ast_operator_dispatcher( - op, binary_row_output(*this), row_index, typed_lhs, typed_rhs, output); + auto const typed_lhs = resolve_input(lhs, left_row_index); + auto const typed_rhs = resolve_input(rhs, right_row_index); + ast_operator_dispatcher(op, + binary_expression_output_handler(*this), + output_object, + output_row_index, + typed_lhs, + typed_rhs, + output); } template >* = nullptr> - __device__ void operator()(cudf::size_type row_index, - detail::device_data_reference lhs, - detail::device_data_reference rhs, - detail::device_data_reference output) const + __device__ void operator()(OutputType& output_object, + cudf::size_type left_row_index, + cudf::size_type right_row_index, + const detail::device_data_reference lhs, + const detail::device_data_reference rhs, + const detail::device_data_reference output, + cudf::size_type output_row_index, + const ast_operator op) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } - private: - table_device_view const& table; - device_span literals; - std::int64_t* thread_intermediate_storage; - mutable_column_device_view* output_column; -}; - -template ()>*> -__device__ void row_output::resolve_output(detail::device_data_reference device_data_reference, - cudf::size_type row_index, - Element result) const -{ - auto const ref_type = device_data_reference.reference_type; - if (ref_type == detail::device_data_reference_type::COLUMN) { - evaluator.output_column->element(row_index) = result; - } else { // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE - // Using memcpy instead of reinterpret_cast for safe type aliasing. - // Using a temporary variable ensures that the compiler knows the result is aligned. - std::int64_t tmp; - memcpy(&tmp, &result, sizeof(Element)); - evaluator.thread_intermediate_storage[device_data_reference.data_index] = tmp; + /** + * @brief Evaluate an expression applied to a row. + * + * This function performs an n-ary transform for one row on one thread. + * + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param row_index Row index of all input and output data column(s). + */ + template + __device__ void evaluate(OutputType& output_object, cudf::size_type const row_index) + { + evaluate(output_object, row_index, row_index, row_index); } -} -/** - * @brief Evaluate an expression applied to a row. - * - * This function performs an n-ary transform for one row on one thread. - * - * @param evaluator The row evaluator used for evaluation. - * @param data_references Array of data references. - * @param operators Array of operators to perform. - * @param operator_source_indices Array of source indices for the operators. - * @param num_operators Number of operators. 
- * @param row_index Row index of data column(s). - */ -__device__ void evaluate_row_expression( - detail::row_evaluator const& evaluator, - device_span data_references, - device_span operators, - device_span operator_source_indices, - cudf::size_type row_index) -{ - auto operator_source_index = static_cast(0); - for (cudf::size_type operator_index = 0; operator_index < operators.size(); operator_index++) { - // Execute operator - auto const op = operators[operator_index]; - auto const arity = ast_operator_arity(op); - if (arity == 1) { - // Unary operator - auto const input = data_references[operator_source_indices[operator_source_index]]; - auto const output = data_references[operator_source_indices[operator_source_index + 1]]; - operator_source_index += arity + 1; - type_dispatcher(input.data_type, evaluator, row_index, input, output, op); - } else if (arity == 2) { - // Binary operator - auto const lhs = data_references[operator_source_indices[operator_source_index]]; - auto const rhs = data_references[operator_source_indices[operator_source_index + 1]]; - auto const output = data_references[operator_source_indices[operator_source_index + 2]]; - operator_source_index += arity + 1; - type_dispatcher(lhs.data_type, - detail::single_dispatch_binary_operator{}, - evaluator, - row_index, - lhs, - rhs, - output, - op); - } else { - cudf_assert(false && "Invalid operator arity."); + /** + * @brief Evaluate an expression applied to a row. + * + * This function performs an n-ary transform for one row on one thread. + * + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param left_row_index The row to pull the data from the left table. + * @param right_row_index The row to pull the data from the right table. + * @param output_row_index The row in the output to insert the result. + */ + template + __device__ void evaluate(OutputType& output_object, + cudf::size_type const left_row_index, + cudf::size_type const right_row_index, + cudf::size_type const output_row_index) + { + auto operator_source_index = static_cast(0); + for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); + operator_index++) { + // Execute operator + auto const op = plan.operators[operator_index]; + auto const arity = ast_operator_arity(op); + if (arity == 1) { + // Unary operator + auto const input = + plan.data_references[plan.operator_source_indices[operator_source_index]]; + auto const output = + plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; + operator_source_index += arity + 1; + auto input_row_index = + input.table_source == table_reference::LEFT ? 
left_row_index : right_row_index; + type_dispatcher(input.data_type, + *this, + output_object, + input_row_index, + input, + output, + output_row_index, + op); + } else if (arity == 2) { + // Binary operator + auto const lhs = plan.data_references[plan.operator_source_indices[operator_source_index]]; + auto const rhs = + plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; + auto const output = + plan.data_references[plan.operator_source_indices[operator_source_index + 2]]; + operator_source_index += arity + 1; + type_dispatcher(lhs.data_type, + detail::single_dispatch_binary_operator{}, + *this, + output_object, + left_row_index, + right_row_index, + lhs, + rhs, + output, + output_row_index, + op); + } else { + cudf_assert(false && "Invalid operator arity."); + } } } -} -/** - * @brief The AST plan creates a device buffer of data needed to execute an AST. - * - * On construction, an AST plan creates a single "packed" host buffer of all necessary data arrays, - * and copies that to the device with a single host-device memory copy. Because the plan tends to be - * small, this is the most efficient approach for low latency. - * - */ -struct ast_plan { - ast_plan(linearizer const& expr_linearizer, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _sizes{}, _data_pointers{} - { - add_to_plan(expr_linearizer.data_references()); - add_to_plan(expr_linearizer.literals()); - add_to_plan(expr_linearizer.operators()); - add_to_plan(expr_linearizer.operator_source_indices()); + private: + /** + * @brief Helper struct for type dispatch on the result of an expression. + * + * Evaluating an expression requires multiple levels of type dispatch to + * determine the input types, the operation type, and the output type. This + * helper class is a functor that handles the operator dispatch, invokes the + * operator, and dispatches output writing based on the resulting data type. + */ + struct expression_output_handler { + public: + __device__ expression_output_handler(expression_evaluator const& evaluator) + : evaluator(evaluator) + { + } - // Create device buffer - auto const buffer_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); - auto buffer_offsets = std::vector(_sizes.size()); - thrust::exclusive_scan(_sizes.cbegin(), _sizes.cend(), buffer_offsets.begin(), 0); + /** + * @brief Resolves an output data reference and assigns result value. + * + * Only output columns (COLUMN) and intermediates (INTERMEDIATE) are supported as output + * reference types. Intermediates must be of fixed width less than or equal to + * sizeof(std::int64_t). This requirement on intermediates is enforced by the linearizer. + * + * @tparam Element Type of result element. + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param device_data_reference Data reference to resolve. + * @param row_index Row index of data column. + * @param result Value to assign to output. 
+ */ + template ())> + __device__ void resolve_output(OutputType& output_object, + const detail::device_data_reference device_data_reference, + const cudf::size_type row_index, + const possibly_null_value_t result) const + { + auto const ref_type = device_data_reference.reference_type; + if (ref_type == detail::device_data_reference_type::COLUMN) { + output_object.template set_value(row_index, result); + } else { // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE + // Using memcpy instead of reinterpret_cast for safe type aliasing. + // Using a temporary variable ensures that the compiler knows the result is aligned. + IntermediateDataType tmp; + memcpy(&tmp, &result, sizeof(possibly_null_value_t)); + evaluator.thread_intermediate_storage[device_data_reference.data_index] = tmp; + } + } - auto h_data_buffer = std::make_unique(buffer_size); - for (unsigned int i = 0; i < _data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.get() + buffer_offsets[i], _data_pointers[i], _sizes[i]); + template ())> + __device__ void resolve_output(OutputType& output_object, + const detail::device_data_reference device_data_reference, + const cudf::size_type row_index, + const possibly_null_value_t result) const + { + cudf_assert(false && "Invalid type in resolve_output."); } - _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); + protected: + expression_evaluator const& evaluator; + }; - stream.synchronize(); + /** + * @brief Subclass of the expression output handler for unary operations. + * + * This functor's call operator is specialized to handle unary operations, + * which only require a single operand. + */ + template + struct unary_expression_output_handler : public expression_output_handler { + __device__ unary_expression_output_handler(expression_evaluator const& evaluator) + : expression_output_handler(evaluator) + { + } - // Create device pointers to components of plan - auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); - _device_data_references = device_span( - reinterpret_cast(device_data_buffer_ptr + - buffer_offsets[0]), - expr_linearizer.data_references().size()); - _device_literals = device_span( - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]), - expr_linearizer.literals().size()); - _device_operators = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), - expr_linearizer.operators().size()); - _device_operator_source_indices = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), - expr_linearizer.operator_source_indices().size()); - } + /** + * @brief Callable to perform a unary operation. + * + * @tparam op The operation to perform. + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param output_row_index The row in the output object to insert the data. + * @param input Input to the operation. + * @param output Output data reference. + */ + template < + ast_operator op, + typename OutputType, + std::enable_if_t, Input>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t input, + const detail::device_data_reference output) const + { + using OperatorFunctor = detail::operator_functor; + using Out = cuda::std::invoke_result_t; + if constexpr (has_nulls) { + auto const result = input.has_value() + ? 
possibly_null_value_t(OperatorFunctor{}(*input)) + : possibly_null_value_t(); + this->template resolve_output(output_object, output, output_row_index, result); + } else { + this->template resolve_output( + output_object, output, output_row_index, OperatorFunctor{}(input)); + } + } + + template < + ast_operator op, + typename OutputType, + std::enable_if_t, Input>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t input, + const detail::device_data_reference output) const + { + cudf_assert(false && "Invalid unary dispatch operator for the provided input."); + } + }; /** - * @brief Helper function for adding components (operators, literals, etc) to AST plan + * @brief Subclass of the expression output handler for binary operations. * - * @tparam T The underlying type of the input `std::vector` - * @param v The `std::vector` containing components (operators, literals, etc) + * This functor's call operator is specialized to handle binary operations, + * which require two operands. */ - template - void add_to_plan(std::vector const& v) - { - auto const data_size = sizeof(T) * v.size(); - _sizes.push_back(data_size); - _data_pointers.push_back(v.data()); - } + template + struct binary_expression_output_handler : public expression_output_handler { + __device__ binary_expression_output_handler(expression_evaluator const& evaluator) + : expression_output_handler(evaluator) + { + } - std::vector _sizes; - std::vector _data_pointers; + /** + * @brief Callable to perform a binary operation. + * + * @tparam op The operation to perform. + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param output_row_index The row in the output to insert the result. + * @param lhs Left input to the operation. + * @param rhs Right input to the operation. + * @param output Output data reference. + */ + template , LHS, RHS>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t lhs, + const possibly_null_value_t rhs, + const detail::device_data_reference output) const + { + using OperatorFunctor = detail::operator_functor; + using Out = cuda::std::invoke_result_t; + if constexpr (has_nulls) { + if constexpr (op == ast_operator::EQUAL) { + // Special handling of the equality operator based on what kind + // of null handling was requested. + possibly_null_value_t result; + if (!lhs.has_value() && !rhs.has_value()) { + // Case 1: Both null, so the output is based on compare_nulls. + result = possibly_null_value_t(this->evaluator.compare_nulls == + null_equality::EQUAL); + } else if (lhs.has_value() && rhs.has_value()) { + // Case 2: Neither is null, so the output is given by the operation. + result = possibly_null_value_t(OperatorFunctor{}(*lhs, *rhs)); + } else { + // Case 3: One value is null, while the other is not, so we simply propagate nulls. + result = possibly_null_value_t(); + } + this->template resolve_output(output_object, output, output_row_index, result); + } else { + // Default behavior for all other operators is to propagate nulls. + auto result = (lhs.has_value() && rhs.has_value()) + ? 
possibly_null_value_t(OperatorFunctor{}(*lhs, *rhs)) + : possibly_null_value_t(); + this->template resolve_output(output_object, output, output_row_index, result); + } + } else { + this->template resolve_output( + output_object, output, output_row_index, OperatorFunctor{}(lhs, rhs)); + } + } - rmm::device_buffer _device_data_buffer; - device_span _device_data_references; - device_span _device_literals; - device_span _device_operators; - device_span _device_operator_source_indices; + template , LHS, RHS>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t lhs, + const possibly_null_value_t rhs, + const detail::device_data_reference output) const + { + cudf_assert(false && "Invalid binary dispatch operator for the provided input."); + } + }; + + table_device_view const& left; ///< The left table to operate on. + table_device_view const& right; ///< The right table to operate on. + device_ast_plan const& + plan; ///< The container of device data representing the expression to evaluate. + IntermediateDataType* + thread_intermediate_storage; ///< The shared memory store of intermediates produced during + ///< evaluation. + null_equality + compare_nulls; ///< Whether the equality operator returns true or false for two nulls. }; /** diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 7099c29b9df..e6ff6b0eadc 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,37 +42,36 @@ enum class binary_operator : int32_t { FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then ///< flooring the result MOD, ///< operator % + PMOD, ///< positive modulo operator + ///< If remainder is negative, this returns (remainder + divisor) % divisor + ///< else, it returns (dividend % divisor) PYMOD, ///< operator % but following python's sign rules for negatives POW, ///< lhs ^ rhs + LOG_BASE, ///< logarithm to the base + ATAN2, ///< 2-argument arctangent + SHIFT_LEFT, ///< operator << + SHIFT_RIGHT, ///< operator >> + SHIFT_RIGHT_UNSIGNED, ///< operator >>> (from Java) + ///< Logical right shift. Casts to an unsigned value before shifting. + BITWISE_AND, ///< operator & + BITWISE_OR, ///< operator | + BITWISE_XOR, ///< operator ^ + LOGICAL_AND, ///< operator && + LOGICAL_OR, ///< operator || EQUAL, ///< operator == NOT_EQUAL, ///< operator != LESS, ///< operator < GREATER, ///< operator > LESS_EQUAL, ///< operator <= GREATER_EQUAL, ///< operator >= - BITWISE_AND, ///< operator & - BITWISE_OR, ///< operator | - BITWISE_XOR, ///< operator ^ - LOGICAL_AND, ///< operator && - LOGICAL_OR, ///< operator || - COALESCE, ///< operator x,y x is null ? y : x - GENERIC_BINARY, ///< generic binary operator to be generated with input - ///< ptx code - SHIFT_LEFT, ///< operator << - SHIFT_RIGHT, ///< operator >> - SHIFT_RIGHT_UNSIGNED, ///< operator >>> (from Java) - ///< Logical right shift. Casts to an unsigned value before shifting. 
- LOG_BASE, ///< logarithm to the base - ATAN2, ///< 2-argument arctangent - PMOD, ///< positive modulo operator - ///< If remainder is negative, this returns (remainder + divisor) % divisor - ///< else, it returns (dividend % divisor) NULL_EQUALS, ///< Returns true when both operands are null; false when one is null; the ///< result of equality when both are non-null NULL_MAX, ///< Returns max of operands when both are non-null; returns the non-null ///< operand when one is null; or invalid when both are null NULL_MIN, ///< Returns min of operands when both are non-null; returns the non-null ///< operand when one is null; or invalid when both are null + GENERIC_BINARY, ///< generic binary operator to be generated with input + ///< ptx code INVALID_BINARY ///< invalid operation }; /** @@ -87,6 +86,7 @@ enum class binary_operator : int32_t { * * @param lhs The left operand scalar * @param rhs The right operand column + * @param op The binary operator * @param output_type The desired data type of the output column * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of @@ -112,6 +112,7 @@ std::unique_ptr binary_operation( * * @param lhs The left operand column * @param rhs The right operand scalar + * @param op The binary operator * @param output_type The desired data type of the output column * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of @@ -135,6 +136,7 @@ std::unique_ptr binary_operation( * * @param lhs The left operand column * @param rhs The right operand column + * @param op The binary operator * @param output_type The desired data type of the output column * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of @@ -202,5 +204,89 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, cudf::data_type const& lhs, cudf::data_type const& rhs); +namespace experimental { +/** + * @brief Performs a binary operation between a scalar and a column. + * + * The output contains the result of `op(lhs, rhs[i])` for all `0 <= i < rhs.size()` + * The scalar is the left operand and the column elements are the right operand. + * This distinction is significant in case of non-commutative binary operations + * + * Regardless of the operator, the validity of the output value is the logical + * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * + * @param lhs The left operand scalar + * @param rhs The right operand column + * @param op The binary operator + * @param output_type The desired data type of the output column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Output column of `output_type` type containing the result of + * the binary operation + * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. + */ +std::unique_ptr binary_operation( + scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Performs a binary operation between a column and a scalar. 
+ * + * The output contains the result of `op(lhs[i], rhs)` for all `0 <= i < lhs.size()` + * The column elements are the left operand and the scalar is the right operand. + * This distinction is significant in case of non-commutative binary operations + * + * Regardless of the operator, the validity of the output value is the logical + * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * + * @param lhs The left operand column + * @param rhs The right operand scalar + * @param op The binary operator + * @param output_type The desired data type of the output column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Output column of `output_type` type containing the result of + * the binary operation + * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. + */ +std::unique_ptr binary_operation( + column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Performs a binary operation between two columns. + * + * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()` + * + * Regardless of the operator, the validity of the output value is the logical + * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * + * @param lhs The left operand column + * @param rhs The right operand column + * @param op The binary operator + * @param output_type The desired data type of the output column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Output column of `output_type` type containing the result of + * the binary operation + * @throw cudf::logic_error if @p lhs and @p rhs are different sizes + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. + * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + */ +std::unique_ptr binary_operation( + column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace experimental /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index ee367840644..8decce7f260 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -293,7 +293,7 @@ class column { /** * @brief Implicit conversion operator to a `mutable_column_view`. * - * This allows pasing a `column` object into a function that accepts a + * This allows passing a `column` object into a function that accepts a *`mutable_column_view`. The conversion is automatic. 
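A usage sketch for the `cudf::experimental::binary_operation` overloads declared above. The scalar-on-the-left overload matters for non-commutative operators; the wrapper name and dtype choice here are illustrative, not part of the header:

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/scalar/scalar.hpp>

// Computes 10 + rhs[i] for every row. With SUB instead of ADD, the operand
// order (scalar first) would change the result, per the @brief above.
std::unique_ptr<cudf::column> add_ten(cudf::column_view const& rhs)
{
  cudf::numeric_scalar<int32_t> ten{10};
  return cudf::experimental::binary_operation(
    ten, rhs, cudf::binary_operator::ADD, cudf::data_type{cudf::type_id::INT32});
}
```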
* @note Creating a mutable view of a `column` invalidates the `column`'s diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 8cb05ca0bad..02e3eee6b43 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -37,7 +37,7 @@ /** * @file column_device_view.cuh - * @brief Column device view class definitons + * @brief Column device view class definitions */ namespace cudf { @@ -541,7 +541,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * * optional_begin with mode `DYNAMIC` defers the assumption of nullability to * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on mutliple + * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple * iterators and you don't want to compile all the combinations of iterator types * * Example: diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index e5424f0fc44..bdb7fd48e60 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -399,7 +399,7 @@ std::unique_ptr make_strings_column( * one more than the total number of strings so the `offsets.back()` is the total number of bytes * in the strings array. `offsets.front()` must always be 0 to point to the beginning of `strings`. * @param[in] null_mask Device span containing the null element indicator bitmask. Arrow format for - * nulls is used for interpeting this bitmask. + * nulls is used for interpreting this bitmask. * @param[in] null_count The number of null string entries. If equal to `UNKNOWN_NULL_COUNT`, the * null count will be computed dynamically on the first invocation of `column::null_count()` * @param[in] stream CUDA stream used for device memory operations and kernel launches. @@ -428,7 +428,7 @@ std::unique_ptr make_strings_column( * strings are identified by the offsets and the nullmask. * @param[in] null_count The number of null string entries. * @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for - * nulls is used for interpeting this bitmask. + * nulls is used for interpreting this bitmask. * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. @@ -491,7 +491,7 @@ std::unique_ptr make_strings_column( * further nested. * @param[in] null_count The number of null list entries. * @param[in] null_mask The bits specifying the null lists in device memory. - * Arrow format for nulls is used for interpeting this bitmask. + * Arrow format for nulls is used for interpreting this bitmask. 
 * @param[in] stream Optional stream for use with all memory allocation
 * and device kernels
 * @param[in] mr Optional resource to use for device memory
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 82326a21d7d..7ab8cc0f6b1 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -22,7 +22,7 @@
 
 /**
  * @file column_view.hpp
- * @brief column view class definitons
+ * @brief column view class definitions
  */
 
 namespace cudf {
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
index 477c53535de..6ab115196d6 100644
--- a/cpp/include/cudf/copying.hpp
+++ b/cpp/include/cudf/copying.hpp
@@ -529,6 +529,7 @@ struct packed_columns {
    * @ingroup copy_split
    */
  struct metadata {
+    metadata() = default;
    metadata(std::vector<uint8_t>&& v) : data_(std::move(v)) {}
    uint8_t const* data() const { return data_.data(); }
    size_t size() const { return data_.size(); }
@@ -537,6 +538,15 @@ struct packed_columns {
    std::vector<uint8_t> data_;
  };
 
+  packed_columns()
+    : metadata_(std::make_unique<metadata>()), gpu_data(std::make_unique<rmm::device_buffer>())
+  {
+  }
+  packed_columns(std::unique_ptr<metadata>&& md, std::unique_ptr<rmm::device_buffer>&& gd)
+    : metadata_(std::move(md)), gpu_data(std::move(gd))
+  {
+  }
+
  std::unique_ptr<metadata> metadata_;
  std::unique_ptr<rmm::device_buffer> gpu_data;
 };
@@ -629,7 +639,7 @@ packed_columns pack(cudf::table_view const& input,
 * guaranteeing that all of the columns in the table point into `contiguous_buffer`.
 *
 * @param input View of the table to pack
- * @param contgiuous_buffer A contiguous buffer of device memory which contains the data referenced
+ * @param contiguous_buffer A contiguous buffer of device memory which contains the data referenced
 * by the columns in `table`
 * @param buffer_size The size of `contiguous_buffer`.
 * @return Vector of bytes representing the metadata used to `unpack` a packed_columns struct.
diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp
index a276769c169..980c824fdf2 100644
--- a/cpp/include/cudf/datetime.hpp
+++ b/cpp/include/cudf/datetime.hpp
@@ -189,6 +189,23 @@ std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::column_view const& months,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Check if the year of the given date is a leap year
+ *
+ * `output[i] == true` if year of `column[i]` is a leap year
+ * `output[i] == false` if year of `column[i]` is not a leap year
+ * `output[i] is null` if `column[i]` is null
+ *
+ * @param[in] column cudf::column_view of the input datetime values
+ *
+ * @returns cudf::column of datatype BOOL8 with the leap-year truth value for each date
+ * @throw cudf::logic_error if input column datatype is not a TIMESTAMP
+ */
+std::unique_ptr<cudf::column> is_leap_year(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace datetime
 }  // namespace cudf
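A minimal call sketch for the `is_leap_year` API added above (the wrapper name is illustrative):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/datetime.hpp>

// Returns a BOOL8 column: true or false per row, null where `timestamps` is
// null, matching the contract documented above. Throws cudf::logic_error if
// the input is not a TIMESTAMP column.
std::unique_ptr<cudf::column> leap_flags(cudf::column_view const& timestamps)
{
  return cudf::datetime::is_leap_year(timestamps);
}
```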
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 09763d66403..53c1f47c201 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -643,7 +643,7 @@ struct identity_initializer {
  * The `i`th column will be initialized with the identity value of the `i`th
  * aggregation operation in `aggs`.
 *
- * @throw cudf::logic_error if column type and corresponging agg are incompatible
+ * @throw cudf::logic_error if column type and corresponding agg are incompatible
 * @throw cudf::logic_error if column type is not fixed-width
 *
 * @param table The table of columns to initialize.
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 373d695a5b5..10d9d8c1b92 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -53,6 +53,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
     data_type col_type, class sum_of_squares_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class mean_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class m2_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class var_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
@@ -75,14 +77,16 @@ class simple_aggregations_collector {  // Declares the interface for the simple
     data_type col_type, class collect_list_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class collect_set_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class lead_lag_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class udf_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class merge_lists_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class merge_sets_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
-                                                          class lead_lag_aggregation const& agg);
-  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
-                                                          class udf_aggregation const& agg);
+                                                          class merge_m2_aggregation const& agg);
 };
 
 class aggregation_finalizer {  // Declares the interface for the finalizer
@@ -98,6 +102,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class all_aggregation const& agg);
   virtual void visit(class sum_of_squares_aggregation const& agg);
   virtual void visit(class mean_aggregation const& agg);
+  virtual void visit(class m2_aggregation const& agg);
   virtual void visit(class var_aggregation const& agg);
   virtual void visit(class std_aggregation const& agg);
   virtual void visit(class median_aggregation const& agg);
@@ -109,10 +114,11 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class row_number_aggregation const& agg);
   virtual void visit(class collect_list_aggregation const& agg);
   virtual void visit(class collect_set_aggregation const& agg);
-  virtual void visit(class merge_lists_aggregation const& agg);
-  virtual void visit(class merge_sets_aggregation const& agg);
   virtual void visit(class lead_lag_aggregation const& agg);
   virtual void visit(class udf_aggregation const& agg);
+  virtual void visit(class merge_lists_aggregation const& agg);
+  virtual void visit(class merge_sets_aggregation const& agg);
+  virtual void visit(class merge_m2_aggregation const& agg);
 };
 
 /**
@@ -286,6 +292,25 @@ class mean_aggregation final : public rolling_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived class for specifying an M2 aggregation
+ */
+class m2_aggregation : public aggregation {
+ public:
+  m2_aggregation() : aggregation{M2} {}
+
+  std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<m2_aggregation>(*this);
+  }
+
std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived class for specifying a standard deviation/variance aggregation */ @@ -633,66 +658,6 @@ class collect_set_aggregation final : public rolling_aggregation { } }; -/** - * @brief Derived aggregation class for specifying MERGE_LISTs aggregation - */ -class merge_lists_aggregation final : public aggregation { - public: - explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} - - std::unique_ptr clone() const override - { - return std::make_unique(*this); - } - std::vector> get_simple_aggregations( - data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override - { - return collector.visit(col_type, *this); - } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } -}; - -/** - * @brief Derived aggregation class for specifying MERGE_SETs aggregation - */ -class merge_sets_aggregation final : public aggregation { - public: - explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal) - : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) - { - } - - null_equality _nulls_equal; ///< whether to consider nulls as equal value - nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to - ///< floating point types) - - bool is_equal(aggregation const& _other) const override - { - if (!this->aggregation::is_equal(_other)) { return false; } - auto const& other = dynamic_cast(_other); - return (_nulls_equal == other._nulls_equal && _nans_equal == other._nans_equal); - } - - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } - - std::unique_ptr clone() const override - { - return std::make_unique(*this); - } - std::vector> get_simple_aggregations( - data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override - { - return collector.visit(col_type, *this); - } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } - - protected: - size_t hash_impl() const - { - return std::hash{}(static_cast(_nulls_equal) ^ static_cast(_nans_equal)); - } -}; - /** * @brief Derived aggregation class for specifying LEAD/LAG window aggregations */ @@ -783,6 +748,85 @@ class udf_aggregation final : public rolling_aggregation { } }; +/** + * @brief Derived aggregation class for specifying MERGE_LISTS aggregation + */ +class merge_lists_aggregation final : public aggregation { + public: + explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_SETS aggregation + */ +class merge_sets_aggregation final : public aggregation { + public: + explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal) + : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) + { + } + + null_equality _nulls_equal; ///< 
whether to consider nulls as equal value + nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to + ///< floating point types) + + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_nulls_equal == other._nulls_equal && _nans_equal == other._nans_equal); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + protected: + size_t hash_impl() const + { + return std::hash{}(static_cast(_nulls_equal) ^ static_cast(_nans_equal)); + } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_M2 aggregation + */ +class merge_m2_aggregation final : public aggregation { + public: + explicit merge_m2_aggregation() : aggregation{MERGE_M2} {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. * @@ -904,6 +948,12 @@ struct target_type_impl() && is_su using type = Source; }; +// Always use `double` for M2 +template +struct target_type_impl { + using type = double; +}; + // Always use `double` for VARIANCE template struct target_type_impl { @@ -970,6 +1020,18 @@ struct target_type_impl { using type = cudf::list_view; }; +// Always use Source for LEAD +template +struct target_type_impl { + using type = Source; +}; + +// Always use Source for LAG +template +struct target_type_impl { + using type = Source; +}; + // Always use list for MERGE_LISTS template struct target_type_impl { @@ -982,16 +1044,10 @@ struct target_type_impl { using type = cudf::list_view; }; -// Always use Source for LEAD -template -struct target_type_impl { - using type = Source; -}; - -// Always use Source for LAG -template -struct target_type_impl { - using type = Source; +// Always use struct for MERGE_M2 +template +struct target_type_impl { + using type = cudf::struct_view; }; /** @@ -1061,6 +1117,7 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MEAN: return f.template operator()(std::forward(args)...); + case aggregation::M2: return f.template operator()(std::forward(args)...); case aggregation::VARIANCE: return f.template operator()(std::forward(args)...); case aggregation::STD: @@ -1083,14 +1140,16 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: return f.template operator()(std::forward(args)...); - case aggregation::MERGE_LISTS: - return f.template operator()(std::forward(args)...); - case aggregation::MERGE_SETS: - return f.template operator()(std::forward(args)...); case aggregation::LEAD: return f.template operator()(std::forward(args)...); case aggregation::LAG: 
return f.template operator()(std::forward(args)...); + case aggregation::MERGE_LISTS: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_SETS: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_M2: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index aebf0c23469..79da4a997da 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -232,9 +232,10 @@ std::unique_ptr sample( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr get_element(column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr get_element( + column_view const& input, + size_type index, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 1acdcadaacf..74a94f34ad8 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -46,7 +46,7 @@ __launch_bounds__(block_size) __global__ RightIter rhs, Filter filter, mutable_column_device_view out, - size_type *__restrict__ const valid_count) + size_type* __restrict__ const valid_count) { const size_type tid = threadIdx.x + blockIdx.x * block_size; const int warp_id = tid / warp_size; @@ -166,7 +166,7 @@ std::unique_ptr copy_if_else( FilterFn filter, cudf::data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { using Element = typename thrust::tuple_element<0, typename thrust::iterator_traits::value_type>::type; diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 017fe0d96ff..9cc319b5011 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -124,6 +124,17 @@ std::unique_ptr add_calendrical_months( cudf::column_view const& months, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr is_leap_year( + cudf::column_view const& column, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace datetime } // namespace cudf diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 8bbd0d1aada..4a2b40e8be7 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -29,7 +29,7 @@ namespace detail { /** * @brief The base class for the input or output index normalizing iterator. * - * This implementation uses CTRP to define the `input_indexalator` and the + * This implementation uses CRTP to define the `input_indexalator` and the * `output_indexalator` classes. 
This is so this class can manipulate the * uniquely typed subclass member variable `p_` directly without requiring * virtual functions since iterator instances will be copied to device memory. @@ -241,7 +241,7 @@ struct base_indexalator { */ struct input_indexalator : base_indexalator { friend struct indexalator_factory; - friend struct base_indexalator; // for CTRP + friend struct base_indexalator; // for CRTP using reference = size_type const; // this keeps STL and thrust happy @@ -326,7 +326,7 @@ struct input_indexalator : base_indexalator { */ struct output_indexalator : base_indexalator { friend struct indexalator_factory; - friend struct base_indexalator; // for CTRP + friend struct base_indexalator; // for CRTP using reference = output_indexalator const&; // required for output iterators diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 4cb0c6e1877..deb161fd9c2 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -177,7 +177,7 @@ auto make_null_replacement_iterator(column_device_view const& column, * * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on mutliple + * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple * iterators and you don't want to compile all the combinations of iterator types * * Example: @@ -819,7 +819,7 @@ auto inline make_pair_iterator(scalar const& scalar_value) * * Else, if the scalar is null, then the value of `p.first` is undefined and `p.second == false`. * - * The behaviour is undefined if the scalar is destroyed before iterator dereferencing. + * The behavior is undefined if the scalar is destroyed before iterator dereferencing. * * @throws cudf::logic_error if scalar datatype and Element type mismatch. 
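For the M2 and MERGE_M2 aggregations introduced in the aggregation hunks above: M2 accumulates the sum of squared deviations from the mean, and MERGE_M2 combines per-partition partial results. A host-side sketch of the standard pairwise combination (Chan et al.); the struct layout and names here are illustrative, while libcudf's MERGE_M2 consumes a STRUCT column of partials, per `target_type_impl` above:

```cpp
#include <cassert>

// Partial statistics for one partition; M2 = sum of squared deviations.
struct m2_state {
  double count{0};
  double mean{0};
  double m2{0};
};

// Pairwise merge: exact for any split of the data, which is what makes the
// aggregation safe to compute per group and then merge.
m2_state merge_m2(m2_state const& a, m2_state const& b)
{
  if (a.count == 0) { return b; }
  if (b.count == 0) { return a; }
  double const n     = a.count + b.count;
  double const delta = b.mean - a.mean;
  return {n,
          a.mean + delta * b.count / n,
          a.m2 + b.m2 + delta * delta * a.count * b.count / n};
}

int main()
{
  m2_state const lo{2, 1.5, 0.5};  // partition {1, 2}
  m2_state const hi{2, 3.5, 0.5};  // partition {3, 4}
  auto const all = merge_m2(lo, hi);
  // Full data {1, 2, 3, 4}: mean 2.5, M2 = sum((x - 2.5)^2) = 5.
  assert(all.count == 4 && all.mean == 2.5 && all.m2 == 5.0);
  return 0;
}
```

This also explains why M2's target type is always `double`: the merge arithmetic needs floating point regardless of the source type.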
* @throws cudf::logic_error if the returned iterator is dereferenced in host diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index a938a3a053a..a779c3defbb 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -77,8 +77,8 @@ struct tagged_element_relational_comparator { { } - __device__ weak_ordering compare(index_type lhs_tagged_index, index_type rhs_tagged_index) const - noexcept + __device__ weak_ordering compare(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept { side const l_side = thrust::get<0>(lhs_tagged_index); side const r_side = thrust::get<0>(rhs_tagged_index); @@ -117,8 +117,8 @@ struct row_lexicographic_tagged_comparator { CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns."); } - __device__ bool operator()(index_type lhs_tagged_index, index_type rhs_tagged_index) const - noexcept + __device__ bool operator()(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept { for (size_type i = 0; i < _lhs.num_columns(); ++i) { bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING); diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 08dae998944..e507bacb919 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -38,7 +38,7 @@ namespace detail { template __global__ void offset_bitmask_binop(Binop op, device_span destination, - device_span source, + device_span source, device_span source_begin_bits, size_type source_size_bits) { @@ -73,16 +73,16 @@ __global__ void offset_bitmask_binop(Binop op, template rmm::device_buffer bitmask_binop( Binop op, - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr}; inplace_bitmask_binop(op, - device_span(static_cast(dest_mask.data()), + device_span(static_cast(dest_mask.data()), num_bitmask_words(mask_size_bits)), masks, masks_begin_bits, @@ -110,11 +110,11 @@ template void inplace_bitmask_binop( Binop op, device_span dest_mask, - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS( std::all_of(masks_begin_bits.begin(), masks_begin_bits.end(), [](auto b) { return b >= 0; }), @@ -123,7 +123,7 @@ void inplace_bitmask_binop( CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }), "Mask pointer cannot be null"); - rmm::device_uvector d_masks(masks.size(), stream, mr); + rmm::device_uvector d_masks(masks.size(), stream, mr); rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 77cb321a12c..f757929d839 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -34,25 +34,45 @@ rmm::device_buffer create_null_mask( size_type size, mask_state state, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - 
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -void set_null_mask(bitmask_type *bitmask, +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::count_set_bits + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type count_set_bits(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream); + +/** + * @copydoc cudf::count_unset_bits + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type count_unset_bits(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream); + /** * @copydoc cudf::segmented_count_set_bits * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::vector segmented_count_set_bits(bitmask_type const *bitmask, +std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -61,7 +81,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::vector segmented_count_unset_bits(bitmask_type const *bitmask, +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -72,11 +92,11 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, * @param stream CUDA stream used for device memory operations and kernel launches. */ rmm::device_buffer copy_bitmask( - bitmask_type const *mask, + bitmask_type const* mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*) @@ -84,9 +104,9 @@ rmm::device_buffer copy_bitmask( * @param stream CUDA stream used for device memory operations and kernel launches. */ rmm::device_buffer copy_bitmask( - column_view const &view, + column_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc bitmask_and(host_span const, host_span const, @@ -95,11 +115,11 @@ rmm::device_buffer copy_bitmask( * @param stream CUDA stream used for device memory operations and kernel launches */ rmm::device_buffer bitmask_and( - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_and @@ -107,9 +127,9 @@ rmm::device_buffer bitmask_and( * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ rmm::device_buffer bitmask_and( - table_view const &view, + table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_or @@ -117,9 +137,9 @@ rmm::device_buffer bitmask_and( * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ rmm::device_buffer bitmask_or( - table_view const &view, + table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a bitwise AND of the specified bitmasks, @@ -135,11 +155,11 @@ rmm::device_buffer bitmask_or( */ void inplace_bitmask_and( device_span dest_mask, - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/nvtx/nvtx3.hpp b/cpp/include/cudf/detail/nvtx/nvtx3.hpp index add5699e34a..0e1a82a0657 100644 --- a/cpp/include/cudf/detail/nvtx/nvtx3.hpp +++ b/cpp/include/cudf/detail/nvtx/nvtx3.hpp @@ -54,7 +54,7 @@ * \code{.cpp} * #include "nvtx3.hpp" * void some_function(){ - * // Begins a NVTX range with the messsage "some_function" + * // Begins a NVTX range with the message "some_function" * // The range ends when some_function() returns and `r` is destroyed * nvtx3::thread_range r{"some_function"}; * @@ -322,7 +322,7 @@ * Example: * \code{.cpp} * // Create an `event_attributes` with the custom message "my message" - * nvtx3::event_attributes attr{nvtx3::Mesage{"my message"}}; + * nvtx3::event_attributes attr{nvtx3::message{"my message"}}; * * // strings and string literals implicitly assumed to be a `nvtx3::message` * nvtx3::event_attributes attr{"my message"}; @@ -1267,7 +1267,7 @@ class registered_message { * nvtx3::thread_range range1{attr1}; * * // `range2` contains message "message 2" - * nvtx3::thread_range range2{nvtx3::Mesage{"message 2"}}; + * nvtx3::thread_range range2{nvtx3::message{"message 2"}}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` @@ -1525,7 +1525,7 @@ class payload { * * // For convenience, the arguments that can be passed to the * `event_attributes` - * // constructor may be passed to the `domain_thread_range` contructor where + * // constructor may be passed to the `domain_thread_range` constructor where * // they will be forwarded to the `EventAttribute`s constructor * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; * \endcode diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 0615e502c60..e672cf01488 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -31,7 +31,7 @@ namespace detail { * doesn't. 
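The detail null-mask declarations above @copydoc their public counterparts; a sketch using the public API to count valid rows (the helper name is illustrative):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/null_mask.hpp>

// Number of valid (non-null) rows in `col`: set bits in its null mask over
// the column's own slice. Non-nullable columns have no mask to inspect.
cudf::size_type valid_rows(cudf::column_view const& col)
{
  if (!col.nullable()) { return col.size(); }
  return cudf::count_set_bits(col.null_mask(), col.offset(), col.offset() + col.size());
}
```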
 *
 * @tparam InputIterator Iterator type for `begin` and `end`
- * @tparam Predicate A predicator type which will be evaludated
+ * @tparam Predicate A predicate type which will be evaluated
 * @param begin Beginning of the sequence of elements
 * @param end End of the sequence of elements
 * @param p Predicate to be applied to each element in `[begin,end)`
diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 16b7da0a083..6380e76fdfa 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -95,12 +95,12 @@ struct genericAtomicOperationImpl {
 
     do {
       assumed = old;
-      T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
+      T const target_value    = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
       uint16_t updating_value = type_reinterpret<uint16_t, T>(op(target_value, update_value));
 
-      T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value
-                                      : (old & 0xffff) | (T_int(updating_value) << 16);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      T_int const new_value = (is_32_align) ? (old & 0xffff0000) | updating_value
+                                            : (old & 0xffff) | (T_int(updating_value) << 16);
+      old                   = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return (is_32_align) ? T(old & 0xffff) : T(old >> 16);
@@ -161,7 +161,7 @@ struct genericAtomicOperationImpl {
 
 // -----------------------------------------------------------------------
 // specialized functions for operators
-// `atomicAdd` supports int32, float, double (signed int64 is not supproted.)
+// `atomicAdd` supports int32, float, double (signed int64 is not supported.)
 // `atomicMin`, `atomicMax` support int32_t, int64_t
 // `atomicAnd`, `atomicOr`, `atomicXor` support int32_t, int64_t
 template <>
diff --git a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
index 8c0abbad49f..05a788abd45 100644
--- a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
@@ -50,7 +50,7 @@ struct null_replacing_transformer {
   }
 
   template <typename T>
-  CUDA_HOST_DEVICE_CALLABLE type operator()(thrust::pair<T, bool> const &pair_value)
+  CUDA_HOST_DEVICE_CALLABLE type operator()(thrust::pair<T, bool> const& pair_value)
   {
     if (pair_value.second)
       return f(pair_value.first);
@@ -83,7 +83,7 @@ struct meanvar {
   using this_t = cudf::meanvar<T>;
 
   CUDA_HOST_DEVICE_CALLABLE
-  this_t operator+(this_t const &rhs) const
+  this_t operator+(this_t const& rhs) const
   {
     return this_t((this->value + rhs.value),
                   (this->value_squared + rhs.value_squared),
@@ -91,7 +91,7 @@ struct meanvar {
   };
 
   CUDA_HOST_DEVICE_CALLABLE
-  bool operator==(this_t const &rhs) const
+  bool operator==(this_t const& rhs) const
   {
     return ((this->value == rhs.value) && (this->value_squared == rhs.value_squared) &&
             (this->count == rhs.count));
@@ -114,7 +114,7 @@ struct meanvar {
 template <typename ElementType>
 struct transformer_squared {
   CUDA_HOST_DEVICE_CALLABLE
-  ElementType operator()(ElementType const &value) { return (value * value); };
+  ElementType operator()(ElementType const& value) { return (value * value); };
 };
 
 /**
@@ -131,7 +131,7 @@ struct transformer_meanvar {
   using ResultType = meanvar<ElementType>;
 
   CUDA_HOST_DEVICE_CALLABLE
-  ResultType operator()(thrust::pair<ElementType, bool> const &pair)
+  ResultType operator()(thrust::pair<ElementType, bool> const& pair)
   {
     ElementType v = pair.first;
     return meanvar<ElementType>(v, v * v, (pair.second) ? 1 : 0);
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 11ce9199c2d..4a7e9b89c80 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -117,7 +117,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(
 * input ranges.
 * Given a set of bitmasks, `masks`, the state of bit `j` in mask `i` is
- * determined by `p( *(begin1 + i), *(begin2 + j))`. If the predivate evaluates
- * to true, the the bit is set to `1`. If false, set to `0`.
+ * determined by `p( *(begin1 + i), *(begin2 + j))`. If the predicate evaluates
+ * to true, the bit is set to `1`. If false, set to `0`.
 *
 * Example Arguments:
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index 85c469f58f8..5656b38a0ef 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -116,7 +116,7 @@ class groupby {
   /**
    * @brief Performs grouped aggregations on the specified values.
    *
-   * The values to aggregate and the aggregations to perform are specifed in an
+   * The values to aggregate and the aggregations to perform are specified in an
    * `aggregation_request`. Each request contains a `column_view` of values to
    * aggregate and a set of `aggregation`s to perform on those elements.
    *
@@ -173,7 +173,7 @@ class groupby {
   /**
    * @brief Performs grouped scans on the specified values.
    *
-   * The values to aggregate and the aggregations to perform are specifed in an
+   * The values to aggregate and the aggregations to perform are specified in an
    * `aggregation_request`. Each request contains a `column_view` of values to
    * aggregate and a set of `aggregation`s to perform on those elements.
    *
diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp
index 18398ff4ceb..34410209c72 100644
--- a/cpp/include/cudf/io/avro.hpp
+++ b/cpp/include/cudf/io/avro.hpp
@@ -180,7 +180,7 @@ class avro_reader_options_builder {
   /**
    * @brief move avro_reader_options member once it's built.
    */
-  operator avro_reader_options &&() { return std::move(options); }
+  operator avro_reader_options&&() { return std::move(options); }
 
   /**
    * @brief move avro_reader_options member once it's built.
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 8efe871ad3a..1dff99735ec 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -574,9 +574,9 @@ class csv_reader_options {
    *
    * @param types Vector of dtypes in which the column needs to be read.
    */
-  [
-    [deprecated("The string-based interface will be deprecated."
-                "Use dtypes(std::vector<data_type>) instead.")]] void
+  [[deprecated(
+    "The string-based interface will be deprecated."
+    "Use dtypes(std::vector<data_type>) instead.")]] void
   set_dtypes(std::vector<std::string> types)
   {
     _dtypes = std::move(types);
@@ -997,9 +997,9 @@ class csv_reader_options_builder {
    * @param types Vector of dtypes in which the column needs to be read.
    * @return this for chaining.
    */
-  [
-    [deprecated("The string-based interface will be deprecated."
-                "Use dtypes(std::vector<data_type>) instead.")]] csv_reader_options_builder&
+  [[deprecated(
+    "The string-based interface will be deprecated."
+    "Use dtypes(std::vector<data_type>) instead.")]] csv_reader_options_builder&
   dtypes(std::vector<std::string> types)
   {
     options._dtypes = std::move(types);
@@ -1093,7 +1093,7 @@ class csv_reader_options_builder {
   /**
    * @brief move csv_reader_options member once it's built.
    */
-  operator csv_reader_options &&() { return std::move(options); }
+  operator csv_reader_options&&() { return std::move(options); }
 
   /**
    * @brief move csv_reader_options member once it's built.
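A sketch of the non-deprecated dtype interface that the deprecation messages above point to (the file name and column types are illustrative; `build()` finalizes the builder):

```cpp
#include <cudf/io/csv.hpp>

// Reads a two-column CSV, stating each column's type as a cudf::data_type
// instead of a string such as "int32".
cudf::io::table_with_metadata read_two_ints()
{
  auto opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{"input.csv"})
      .dtypes({cudf::data_type{cudf::type_id::INT32}, cudf::data_type{cudf::type_id::INT64}})
      .build();
  return cudf::io::read_csv(opts);
}
```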
@@ -1422,7 +1422,7 @@ class csv_writer_options_builder { /** * @brief move `csv_writer_options` member once it's built. */ - operator csv_writer_options &&() { return std::move(options); } + operator csv_writer_options&&() { return std::move(options); } /** * @brief move `csv_writer_options` member once it's built. diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 6c885a874ee..c1aff818121 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -322,9 +322,9 @@ class arrow_io_source : public datasource { filesystem = result.ValueOrDie(); // Parse the path from the URI - size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos - ? 0 - : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); + size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos + ? 0 + : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); size_t end = arrow_uri.find(uri_end_delimiter) - start; std::string_view path = arrow_uri.substr(start, end); diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 4310d0e7c4b..98483d1c03e 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -46,10 +46,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector const &filepaths, - avro_reader_options const &options, + explicit reader(std::vector const& filepaths, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources @@ -59,10 +59,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector> &&sources, - avro_reader_options const &options, + explicit reader(std::vector>&& sources, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -77,7 +77,7 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(avro_reader_options const &options, + table_with_metadata read(avro_reader_options const& options, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace avro diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 8ec2818c2ca..89e589d306a 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -41,10 +41,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector const &filepaths, - csv_reader_options const &options, + explicit reader(std::vector const& filepaths, + csv_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources @@ -54,10 +54,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - 
explicit reader(std::vector> &&sources, - csv_reader_options const &options, + explicit reader(std::vector>&& sources, + csv_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -91,9 +91,9 @@ class writer { * @param mr Device memory resource to use for device memory allocation */ writer(std::unique_ptr sinkp, - csv_writer_options const &options, + csv_writer_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); // cannot provide definition here (because + rmm::mr::device_memory_resource* mr); // cannot provide definition here (because // _impl is incomplete hence unique_ptr has // not enough sizeof() info) @@ -109,8 +109,8 @@ class writer { * @param metadata Table metadata and column names * @param stream CUDA stream used for device memory operations and kernel launches. */ - void write(table_view const &table, - const table_metadata *metadata = nullptr, + void write(table_view const& table, + const table_metadata* metadata = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace csv diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 6ed93dc5c25..e6d8f2de483 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -54,10 +54,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector const &filepaths, - json_reader_options const &options, + explicit reader(std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources @@ -67,10 +67,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector> &&sources, - json_reader_options const &options, + explicit reader(std::vector>&& sources, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -83,7 +83,7 @@ class reader { * @param[in] options Settings for controlling reading behavior * @return cudf::table object that contains the array of cudf::column. */ - table_with_metadata read(json_reader_options const &options, + table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7d56c1c0fc6..2f4d0936d8b 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -277,7 +277,7 @@ class json_reader_options_builder { /** * @brief move json_reader_options member once it's built. */ - operator json_reader_options &&() { return std::move(options); } + operator json_reader_options&&() { return std::move(options); } /** * @brief move json_reader_options member once it's built. 
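The `operator json_reader_options&&()` conversion shown above exists so a fully-configured builder can feed the read call directly, with no explicit `build()`; a sketch (the path parameter is illustrative):

```cpp
#include <cudf/io/json.hpp>

#include <string>

// The builder converts to json_reader_options&& at the call site, which then
// binds to read_json's const-reference parameter.
cudf::io::table_with_metadata read_json_lines(std::string const& path)
{
  return cudf::io::read_json(
    cudf::io::json_reader_options::builder(cudf::io::source_info{path}).lines(true));
}
```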
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index bd1e4e96d7d..997f35ed922 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -331,7 +331,7 @@ class orc_reader_options_builder { /** * @brief move orc_reader_options member once it's built. */ - operator orc_reader_options &&() { return std::move(options); } + operator orc_reader_options&&() { return std::move(options); } /** * @brief move orc_reader_options member once it's built. @@ -550,7 +550,7 @@ class orc_writer_options_builder { /** * @brief move orc_writer_options member once it's built. */ - operator orc_writer_options &&() { return std::move(options); } + operator orc_writer_options&&() { return std::move(options); } /** * @brief move orc_writer_options member once it's built. @@ -724,7 +724,7 @@ class chunked_orc_writer_options_builder { /** * @brief move chunked_orc_writer_options member once it's built. */ - operator chunked_orc_writer_options &&() { return std::move(options); } + operator chunked_orc_writer_options&&() { return std::move(options); } /** * @brief move chunked_orc_writer_options member once it's built. diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 178e46a0c5c..ecd9607a87e 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -339,7 +339,7 @@ class parquet_reader_options_builder { /** * @brief move parquet_reader_options member once it's built. */ - operator parquet_reader_options &&() { return std::move(options); } + operator parquet_reader_options&&() { return std::move(options); } /** * @brief move parquet_reader_options member once it's built. @@ -769,7 +769,7 @@ class parquet_writer_options_builder { /** * @brief move parquet_writer_options member once it's built. */ - operator parquet_writer_options &&() { return std::move(options); } + operator parquet_writer_options&&() { return std::move(options); } /** * @brief move parquet_writer_options member once it's built. @@ -973,7 +973,7 @@ class chunked_parquet_writer_options_builder { * @brief Set to true if timestamps should be written as * int96 types instead of int64 types. Even though int96 is deprecated and is * not an internal type for cudf, it needs to be written for backwards - * compatability reasons. + * compatibility reasons. * * @param enabled Boolean value to enable/disable int96 timestamps. * @return this for chaining. @@ -987,7 +987,7 @@ class chunked_parquet_writer_options_builder { /** * @brief move chunked_parquet_writer_options member once it's built. */ - operator chunked_parquet_writer_options &&() { return std::move(options); } + operator chunked_parquet_writer_options&&() { return std::move(options); } /** * @brief move chunked_parquet_writer_options member once it's is built. diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 1f9ed71ce8c..725c0fc3699 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -23,6 +24,7 @@ #include #include +#include #include namespace cudf { @@ -647,5 +649,206 @@ class hash_join { const std::unique_ptr impl; }; +/** + * @brief Returns a pair of row index vectors corresponding to all pairs + * of rows between the specified tables where the predicate evaluates to true. + * + * The first returned vector contains the row indices from the left + * table that have a match in the right table (in unspecified order). 
+ * The corresponding values in the second returned vector are + * the matched row indices from the right table. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Expression: Left.Column_0 == Right.Column_0 + * Result: {{1, 2}, {0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1) + * Result: {{1}, {0}} + * @endcode + * + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a conditional inner join between two tables `left` and `right` . + */ +std::pair>, + std::unique_ptr>> +conditional_inner_join( + table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs + * of rows between the specified tables where the predicate evaluates to true, + * or null matches for rows in left that have no match in right. + * + * The first returned vector contains all the row indices from the left + * table (in unspecified order). The corresponding value in the + * second returned vector is either (1) the row index of the matched row + * from the right table, if there is a match or (2) an unspecified + * out-of-bounds value. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Expression: Left.Column_0 == Right.Column_0 + * Result: {{0, 1, 2}, {None, 0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1) + * Result: {{0, 1, 2}, {None, 0, None}} + * @endcode + * + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a conditional left join between two tables `left` and `right` . + */ +std::pair>, + std::unique_ptr>> +conditional_left_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs + * of rows between the specified tables where the predicate evaluates to true, + * or null matches for rows in either table that have no match in the other. 
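The "unspecified out-of-bounds value" convention above is designed to compose with `cudf::gather`'s NULLIFY policy when materializing the join result; a sketch under that assumption (the wrapper name is illustrative):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/table/table.hpp>

#include <rmm/device_uvector.hpp>

// Rows of `right` that had no match gather through an out-of-bounds index
// and therefore come back as nulls in the output table.
std::unique_ptr<cudf::table> materialize_right(
  cudf::table_view const& right, rmm::device_uvector<cudf::size_type> const& right_indices)
{
  cudf::column_view indices(cudf::data_type{cudf::type_id::INT32},
                            static_cast<cudf::size_type>(right_indices.size()),
                            right_indices.data(),
                            nullptr,  // no null mask; the indices themselves are all valid
                            0);
  return cudf::gather(right, indices, cudf::out_of_bounds_policy::NULLIFY);
}
```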
+/**
+ * @brief Returns a pair of row index vectors corresponding to all pairs
+ * of rows between the specified tables where the predicate evaluates to true,
+ * or null matches for rows in either table that have no match in the other.
+ *
+ * Taken pairwise, the values from the returned vectors are one of:
+ * (1) row indices corresponding to matching rows from the left and
+ * right tables, (2) a row index and an unspecified out-of-bounds value,
+ * representing a row from one table without a match in the other.
+ *
+ * @code{.pseudo}
+ * Left: {{0, 1, 2}}
+ * Right: {{1, 2, 3}}
+ * Expression: Left.Column_0 == Right.Column_0
+ * Result: {{0, 1, 2, None}, {None, 0, 1, 2}}
+ *
+ * Left: {{0, 1, 2}, {3, 4, 5}}
+ * Right: {{1, 2, 3}, {4, 6, 7}}
+ * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1)
+ * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
+ *
+ * @param left The left table
+ * @param right The right table
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether the equality operator returns true or false for two nulls.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a conditional full join between two tables `left` and `right`.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+conditional_full_join(table_view left,
+                      table_view right,
+                      ast::expression binary_predicate,
+                      null_equality compare_nulls = null_equality::EQUAL,
+                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns an index vector corresponding to all rows in the left table
+ * for which there exists some row in the right table where the predicate
+ * evaluates to true.
+ *
+ * @code{.pseudo}
+ * Left: {{0, 1, 2}}
+ * Right: {{1, 2, 3}}
+ * Expression: Left.Column_0 == Right.Column_0
+ * Result: {1, 2}
+ *
+ * Left: {{0, 1, 2}, {3, 4, 5}}
+ * Right: {{1, 2, 3}, {4, 6, 7}}
+ * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1)
+ * Result: {1}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
+ *
+ * @param left The left table
+ * @param right The right table
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether the equality operator returns true or false for two nulls.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A vector `left_indices` that can be used to construct the result of
+ * performing a conditional left semi join between two tables `left` and
+ * `right`.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
+  table_view left,
+  table_view right,
+  ast::expression binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns an index vector corresponding to all rows in the left table
+ * for which there does not exist any row in the right table where the
+ * predicate evaluates to true.
+ *
+ * @code{.pseudo}
+ * Left: {{0, 1, 2}}
+ * Right: {{1, 2, 3}}
+ * Expression: Left.Column_0 == Right.Column_0
+ * Result: {0}
+ *
+ * Left: {{0, 1, 2}, {3, 4, 5}}
+ * Right: {{1, 2, 3}, {4, 6, 7}}
+ * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1)
+ * Result: {0, 2}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
+ *
+ * @param left The left table
+ * @param right The right table
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether the equality operator returns true or false for two nulls.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A vector `left_indices` that can be used to construct the result of
+ * performing a conditional left anti join between two tables `left` and
+ * `right`.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
+  table_view left,
+  table_view right,
+  ast::expression binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index a440e456e25..94b0e830b15 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <cudf/utilities/type_checks.hpp>
 #include
 #include
@@ -89,7 +90,7 @@ std::unique_ptr<column> scatter_impl(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  assert_same_data_type(source, target);
+  CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types.");
   auto const child_column_type = lists_column_view(target).child().type();
diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh
index 76121bc35e9..7d0586ed6a6 100644
--- a/cpp/include/cudf/lists/detail/scatter_helper.cuh
+++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh
@@ -129,11 +129,6 @@ struct unbound_list_view {
   size_type _size{};  // Number of elements in *this* list row.
 };
-/**
- * @brief Checks that the specified columns have matching schemas, all the way down.
- */
-void assert_same_data_type(column_view const& lhs, column_view const& rhs);
-
 std::unique_ptr<column> build_lists_child_column_recursive(
   data_type child_column_type,
   rmm::device_uvector<unbound_list_view> const& list_vector,
diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp
index 9fd913517fc..d094118293b 100644
--- a/cpp/include/cudf/reduction.hpp
+++ b/cpp/include/cudf/reduction.hpp
@@ -64,10 +64,10 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE };
 * @returns Output scalar with reduce result.
 */
 std::unique_ptr<scalar> reduce(
-  column_view const &col,
-  std::unique_ptr<aggregation> const &agg,
+  column_view const& col,
+  std::unique_ptr<aggregation> const& agg,
   data_type output_dtype,
-  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
 * @brief Computes the scan of a column.
@@ -88,11 +88,11 @@ std::unique_ptr<scalar> reduce(
 * @returns unique pointer to new output column
 */
 std::unique_ptr<column> scan(
-  const column_view &input,
-  std::unique_ptr<aggregation> const &agg,
+  const column_view& input,
+  std::unique_ptr<aggregation> const& agg,
   scan_type inclusive,
   null_policy null_handling = null_policy::EXCLUDE,
-  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
 * @brief Determines the minimum and maximum values of a column.
 *
@@ -104,8 +104,8 @@ std::unique_ptr<column> scan(
 * and the second scalar being the maximum value of the input column.
*/ std::pair, std::unique_ptr> minmax( - column_view const &col, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + column_view const& col, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 2e57e56255d..0e14b0c6bf5 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -78,7 +78,7 @@ class scalar { /** * @brief Indicates whether the scalar contains a valid value. * - * @note Using the value when `is_valid() == false` is undefined behaviour. In addition, this + * @note Using the value when `is_valid() == false` is undefined behavior. In addition, this * function does a stream synchronization. * * @param stream CUDA stream used for device memory operations. @@ -154,7 +154,7 @@ class fixed_width_scalar : public scalar { void set_value(T value, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Implicit conversion operator to get the value of the scalar on the host. + * @brief Explicit conversion operator to get the value of the scalar on the host. */ explicit operator value_type() const; @@ -365,6 +365,11 @@ class fixed_point_scalar : public scalar { */ T fixed_point_value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + /** + * @brief Explicit conversion operator to get the value of the scalar on the host. + */ + explicit operator value_type() const; + /** * @brief Returns a raw pointer to the value in device memory. */ @@ -465,7 +470,7 @@ class string_scalar : public scalar { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Implicit conversion operator to get the value of the scalar in a host std::string. + * @brief Explicit conversion operator to get the value of the scalar in a host std::string. */ explicit operator std::string() const; diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index d56d5d5eb0d..884b412d3e2 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -21,7 +21,7 @@ /** * @file scalar_device_view.cuh - * @brief Scalar device view class definitons + * @brief Scalar device view class definitions */ namespace cudf { diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 2454cfe7c7b..36a8131a78e 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -187,7 +187,7 @@ std::unique_ptr rank( /** * @brief Returns sorted order after sorting each segment in the table. * - * If segment_offsets contains values larger than number of rows, behaviour is undefined. + * If segment_offsets contains values larger than number of rows, behavior is undefined. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * * @param keys The table that determines the ordering of elements in each segment @@ -214,7 +214,7 @@ std::unique_ptr segmented_sorted_order( /** * @brief Performs a lexicographic segmented sort of a table * - * If segment_offsets contains values larger than number of rows, behaviour is undefined. + * If segment_offsets contains values larger than number of rows, behavior is undefined. * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. 
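The scalar.hpp hunks above bring the doc comments in line with the `explicit` conversion operators and add the missing one for `fixed_point_scalar`. A small host-side sketch of what call sites look like under these declarations (the wrapper function is illustrative):

```cpp
#include <cudf/scalar/scalar.hpp>

#include <string>

void read_back(cudf::numeric_scalar<int32_t> const& n, cudf::string_scalar const& s)
{
  // The operators are explicit, so a cast is required; each cast copies the
  // value from device to host, and the result is undefined behavior if
  // is_valid() would return false.
  auto host_int = static_cast<int32_t>(n);
  auto host_str = static_cast<std::string>(s);
  (void)host_int;
  (void)host_str;
}
```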
 *
diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp
index 372d9faf13f..604756b5d09 100644
--- a/cpp/include/cudf/strings/capitalize.hpp
+++ b/cpp/include/cudf/strings/capitalize.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include <cudf/scalar/scalar.hpp>
 #include
 #include
@@ -30,21 +31,33 @@ namespace strings {
 /**
 * @brief Returns a column of capitalized strings.
 *
- * Any null string entries return corresponding null output column entries.
+ * If `delimiters` is an empty string, then only the first character of each
+ * row is capitalized. Otherwise, a non-delimiter character is capitalized after
+ * any delimiter character is found.
 *
 * @code{.pseudo}
 * Example:
- * input = ["tesT1", "a Test", "Another Test"];
+ * input = ["tesT1", "a Test", "Another Test", "a\tb"];
 * output = capitalize(input)
- * output is ["Test1", "A test", "Another test"]
+ * output is ["Test1", "A test", "Another test", "A\tb"]
+ * output = capitalize(input, " ")
+ * output is ["Test1", "A Test", "Another Test", "A\tb"]
+ * output = capitalize(input, " \t")
+ * output is ["Test1", "A Test", "Another Test", "A\tB"]
 * @endcode
 *
- * @param[in] input String column.
- * @param[in] mr Device memory resource used to allocate the returned column's device memory
+ * Any null string entries return corresponding null output column entries.
+ *
+ * @throw cudf::logic_error if `delimiters.is_valid()` is `false`.
+ *
+ * @param input String column.
+ * @param delimiters Characters for identifying words to capitalize.
+ * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Column of strings capitalized from the input column.
 */
 std::unique_ptr<column> capitalize(
   strings_column_view const& input,
+  string_scalar const& delimiters = string_scalar(""),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp
index 3e069de2f0f..32f8d482a34 100644
--- a/cpp/include/cudf/strings/combine.hpp
+++ b/cpp/include/cudf/strings/combine.hpp
@@ -272,7 +272,7 @@ std::unique_ptr<column> join_list_elements(
 * delimited by the @p separator provided.
 *
 * A null list row will always result in a null string in the output row. Any non-null list row
- * having a null elenent will result in the corresponding output row to be null unless a
+ * having a null element will result in the corresponding output row being null unless a
 * @p narep string is specified to be used in its place.
 *
 * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the
diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp
index 19dfa193207..6083ebc4a62 100644
--- a/cpp/include/cudf/strings/detail/copying.hpp
+++ b/cpp/include/cudf/strings/detail/copying.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
@@ -54,6 +55,33 @@ std::unique_ptr<column> copy_slice(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a new strings column created by shifting the rows by a specified offset.
+ *
+ * @code{.pseudo}
+ * Example:
+ * s = ["a", "b", "c", "d", "e", "f"]
+ * r1 = shift(s, 2, "_")
+ * r1 is now ["_", "_", "a", "b", "c", "d"]
+ * r2 = shift(s, -2, "_")
+ * r2 is now ["c", "d", "e", "f", "_", "_"]
+ * @endcode
+ *
+ * The caller should set the validity mask in the output column.
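The extended `capitalize` API above is straightforward to exercise; a short sketch matching the pseudocode examples in its doc comment (the wrapper name is illustrative):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/capitalize.hpp>
#include <cudf/strings/strings_column_view.hpp>

std::unique_ptr<cudf::column> title_case(cudf::strings_column_view const& input)
{
  // Capitalize the first character after any space or tab; with the default
  // empty delimiter only the first character of each row would be capitalized.
  return cudf::strings::capitalize(input, cudf::string_scalar(" \t"));
}
```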
+ * + * @param input Strings instance for this operation. + * @param offset The offset by which to shift the input. + * @param fill_value Fill value for indeterminable outputs. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. + */ +std::unique_ptr shift(strings_column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 4023dbc6c84..2b39662456b 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -31,7 +31,7 @@ namespace strings { * @brief Repeat the given string scalar by a given number of times. * * For a given string scalar, an output string scalar is generated by repeating the input string by - * a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not a positve + * a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not a positive * value, an empty (valid) string scalar will be returned. An invalid input scalar will always * result in an invalid output scalar regardless of the value of `repeat_times` parameter. * @@ -42,7 +42,7 @@ namespace strings { * out is '123XYZ-123XYZ-123XYZ-' * @endcode * - * @throw cudf::logic_error if the size of the ouput string scalar exceeds the maximum value that + * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that * can be stored by the index type * (i.e., `input.size() * repeat_times > numeric_limits::max()`). * @@ -61,7 +61,7 @@ std::unique_ptr repeat_strings( * * For a given strings column, an output strings column is generated by repeating each string from * the input by a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not - * a positve value, all the rows of the output strings column will be an empty string. Any null row + * a positive value, all the rows of the output strings column will be an empty string. Any null row * will result in a null row regardless of the value of `repeat_times` parameter. * * Note that this function cannot handle the cases when the size of the output column exceeds the diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index e9091b88b08..40eb796eba7 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -36,7 +36,7 @@ namespace strings { * input string. If not found, the output entry is just a copy of the * corresponding input string. * - * Specifing an empty string for repl will essentially remove the target + * Specifying an empty string for repl will essentially remove the target * string if found in each string. * * Null string entries will return null output string entries. diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index 82b191a8e1b..4978bad3bb3 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -139,7 +139,7 @@ std::unique_ptr
rsplit( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be splitted. + * @param strings A column of string elements to be split. * @param delimiter The string to identify split points in each string. * Default of empty string indicates split on whitespace. * @param maxsplit Maximum number of splits to perform. @@ -216,7 +216,7 @@ std::unique_ptr split_record( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be splitted. + * @param strings A column of string elements to be split. * @param delimiter The string to identify split points in each string. * Default of empty string indicates split on whitespace. * @param maxsplit Maximum number of splits to perform. diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index f5ab2046441..238d55d580e 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -17,11 +17,16 @@ #pragma once #include + +#ifndef __CUDA_ARCH__ #include +#endif +// This is defined when including this header in a https://github.com/NVIDIA/jitify +// or jitify2 source file. The jitify cannot include thrust headers at this time. +#ifndef CUDF_JIT_UDF #include -#include -#include +#endif // This file should only include device code logic. // Host-only or host/device code should be defined in the string_view.hpp header file. @@ -41,8 +46,17 @@ __device__ inline size_type characters_in_string(const char* str, size_type byte { if ((str == 0) || (bytes == 0)) return 0; auto ptr = reinterpret_cast(str); +#ifndef CUDF_JIT_UDF return thrust::count_if( thrust::seq, ptr, ptr + bytes, [](uint8_t chr) { return is_begin_utf8_char(chr); }); +#else + size_type chars = 0; + auto const end = ptr + bytes; + while (ptr < end) { + chars += is_begin_utf8_char(*ptr++); + } + return chars; +#endif } /** @@ -121,7 +135,8 @@ __device__ inline string_view::const_iterator string_view::const_iterator::opera { const_iterator tmp(*this); size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? ++tmp : --tmp; + while (adjust-- > 0) + offset > 0 ? ++tmp : --tmp; return tmp; } @@ -129,7 +144,8 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper string_view::const_iterator::difference_type offset) { size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? operator++() : operator--(); + while (adjust-- > 0) + offset > 0 ? operator++() : operator--(); return *this; } @@ -153,7 +169,8 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper string_view::const_iterator::difference_type offset) { size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? operator--() : operator++(); + while (adjust-- > 0) + offset > 0 ? operator--() : operator++(); return *this; } @@ -162,7 +179,8 @@ __device__ inline string_view::const_iterator string_view::const_iterator::opera { const_iterator tmp(*this); size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? --tmp : ++tmp; + while (adjust-- > 0) + offset > 0 ? 
--tmp : ++tmp; return tmp; } @@ -256,7 +274,8 @@ __device__ inline int string_view::compare(const char* data, size_type bytes) co size_type const len1 = size_bytes(); const unsigned char* ptr1 = reinterpret_cast(this->data()); const unsigned char* ptr2 = reinterpret_cast(data); - size_type idx = 0; + if ((ptr1 == ptr2) && (bytes == len1)) return 0; + size_type idx = 0; for (; (idx < len1) && (idx < bytes); ++idx) { if (*ptr1 != *ptr2) return static_cast(*ptr1) - static_cast(*ptr2); ++ptr1; @@ -327,7 +346,8 @@ __device__ inline size_type string_view::find(const char* str, const char* ptr2 = str; for (size_type idx = 0; idx < len1; ++idx) { bool match = true; - for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]); + for (size_type jdx = 0; match && (jdx < len2); ++jdx) + match = (ptr1[jdx] == ptr2[jdx]); if (match) return character_offset(idx + spos); ptr1++; } @@ -368,7 +388,8 @@ __device__ inline size_type string_view::rfind(const char* str, const char* ptr2 = str; for (int idx = 0; idx < len1; ++idx) { bool match = true; - for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]); + for (size_type jdx = 0; match && (jdx < len2); ++jdx) + match = (ptr1[jdx] == ptr2[jdx]); if (match) return character_offset(epos - len2 - idx); ptr1--; // go backwards } diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 4b1a901d72f..be182cb0e9d 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -15,9 +15,8 @@ */ #pragma once -#include -#include #include + #include /** @@ -36,12 +35,6 @@ using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes */ constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1}; -/** - * @brief This value is assigned to the _char_width member if the string - * contains characters of different widths. - */ -constexpr int8_t VARIABLE_CHAR_WIDTH{0}; - /** * @brief A non-owning, immutable view of device data that is a variable length * char array representing a UTF-8 string. @@ -417,7 +410,7 @@ CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& cha * @brief Place a char_utf8 value into a char array. * * @param character Single character - * @param[out] str Allocated char array with enough space to hold the encoded characer. + * @param[out] str Allocated char array with enough space to hold the encoded character. 
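The `CUDF_JIT_UDF` fallback for `characters_in_string` above counts UTF-8 lead bytes, relying on each character contributing exactly one byte that is not a continuation byte (`10xxxxxx`). A host-side sketch of the same counting idea; `count_utf8_chars` is an illustrative name, not part of this change:

```cpp
#include <cstdint>
#include <cstring>

// Count UTF-8 characters by counting bytes outside the continuation-byte
// range; equivalent to summing is_begin_utf8_char over the buffer.
int count_utf8_chars(char const* s)
{
  int chars      = 0;
  auto const len = std::strlen(s);
  for (std::size_t i = 0; i < len; ++i) {
    chars += (static_cast<std::uint8_t>(s[i]) & 0xC0) != 0x80;
  }
  return chars;
}
```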
* @return The number of bytes in the character */ CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char* str) diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index bec5299ab77..d174222b2ff 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -191,8 +191,8 @@ class element_equality_comparator { */ template ()>* = nullptr> - __device__ bool operator()(size_type lhs_element_index, size_type rhs_element_index) const - noexcept + __device__ bool operator()(size_type lhs_element_index, + size_type rhs_element_index) const noexcept { if (has_nulls) { bool const lhs_is_null{lhs.is_null(lhs_element_index)}; diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 7c80c958f92..71e48370ccf 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -27,7 +27,7 @@ /** * @file table_device_view.cuh - * @brief Table device view class definitons + * @brief Table device view class definitions */ namespace cudf { diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index e99e0db21c5..460c62e3598 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -40,7 +40,7 @@ namespace cudf { * * @param input An immutable view of the input column to transform * @param unary_udf The PTX/CUDA string of the unary function to apply - * @param outout_type The output type that is compatible with the output type in the UDF + * @param output_type The output type that is compatible with the output type in the UDF * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code * @param mr Device memory resource used to allocate the returned column's device memory * @return The column resulting from applying the unary function to @@ -133,7 +133,7 @@ std::pair, std::unique_ptr> encode( * @param bitmask A device pointer to the bitmask which needs to be converted * @param begin_bit position of the bit from which the conversion should start * @param end_bit position of the bit before which the conversion should stop - * @param mr Device memory resource used to allocate the returned columns's device memory + * @param mr Device memory resource used to allocate the returned columns' device memory * @return A boolean column representing the given mask from [begin_bit, end_bit). */ std::unique_ptr mask_to_bools( @@ -164,7 +164,7 @@ std::unique_ptr mask_to_bools( * row_bit_count(column(x)) >= row_bit_count(gather(column(x))) * * @param t The table view to perform the computation on. - * @param mr Device memory resource used to allocate the returned columns's device memory + * @param mr Device memory resource used to allocate the returned columns' device memory * @return A 32-bit integer column containing the per-row bit counts. 
*/ std::unique_ptr row_bit_count( diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 8116097e38e..e1037efb5c8 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -18,10 +18,10 @@ #ifdef __CUDACC__ #define CUDA_HOST_DEVICE_CALLABLE __host__ __device__ inline -#define CUDA_DEVICE_CALLABLE __device__ inline +#define CUDA_DEVICE_CALLABLE __device__ inline #else #define CUDA_HOST_DEVICE_CALLABLE inline -#define CUDA_DEVICE_CALLABLE inline +#define CUDA_DEVICE_CALLABLE inline #endif #include diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 15613c8caa7..2036723a6ed 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -37,7 +37,7 @@ struct cuda_error : public std::runtime_error { } // namespace cudf #define STRINGIFY_DETAIL(x) #x -#define CUDF_STRINGIFY(x) STRINGIFY_DETAIL(x) +#define CUDF_STRINGIFY(x) STRINGIFY_DETAIL(x) /** * @addtogroup utility_error diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e2f5f6db624..2cdc455e05c 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -47,16 +47,20 @@ using void_t = void; */ #define CUDF_ENABLE_IF(...) std::enable_if_t<(__VA_ARGS__)>* = nullptr -template -struct is_relationally_comparable_impl : std::false_type { -}; - template using less_comparable = decltype(std::declval() < std::declval()); template using greater_comparable = decltype(std::declval() > std::declval()); +template +using equality_comparable = decltype(std::declval() == std::declval()); + +namespace detail { +template +struct is_relationally_comparable_impl : std::false_type { +}; + template struct is_relationally_comparable_impl struct is_equality_comparable_impl : std::false_type { }; -template -using equality_comparable = decltype(std::declval() == std::declval()); - template struct is_equality_comparable_impl>> : std::true_type { }; +// has common type +template +struct has_common_type_impl : std::false_type { +}; + +template +struct has_common_type_impl>, Ts...> : std::true_type { +}; +} // namespace detail + +template +using has_common_type = typename detail::has_common_type_impl::type; + +template +constexpr inline bool has_common_type_v = detail::has_common_type_impl::value; + template using is_timestamp_t = cuda::std::disjunction, std::is_same, @@ -104,7 +121,7 @@ using is_duration_t = cuda::std::disjunction, template constexpr inline bool is_relationally_comparable() { - return is_relationally_comparable_impl::value; + return detail::is_relationally_comparable_impl::value; } /** @@ -122,7 +139,7 @@ constexpr inline bool is_relationally_comparable() template constexpr inline bool is_equality_comparable() { - return is_equality_comparable_impl::value; + return detail::is_equality_comparable_impl::value; } /** diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp new file mode 100644 index 00000000000..8d57ab3aaa5 --- /dev/null +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_view.hpp>
+
+namespace cudf {
+
+/**
+ * @brief Compares the types of two `column_view`s
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary types, the types of the keys are compared if both are
+ *   non-empty columns.
+ * - For list types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The first `column_view` to compare
+ * @param rhs The second `column_view` to compare
+ * @return true if column types match
+ */
+bool column_types_equal(column_view const& lhs, column_view const& rhs);
+
+}  // namespace cudf
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index 9fa67dccb52..cd088d81531 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,38 +42,38 @@ namespace test {
 * ```
 */
 class BaseFixture : public ::testing::Test {
-  rmm::mr::device_memory_resource *_mr{rmm::mr::get_current_device_resource()};
+  rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()};
 
 public:
  /**
  * @brief Returns pointer to `device_memory_resource` that should be used for
  * all tests inheriting from this fixture
  */
-  rmm::mr::device_memory_resource *mr() { return _mr; }
+  rmm::mr::device_memory_resource* mr() { return _mr; }
 };
 
 template
 struct uniform_distribution_impl {
 };
 template
-struct uniform_distribution_impl<
-  T,
-  std::enable_if_t::value && not cudf::is_boolean()>> {
+struct uniform_distribution_impl::value>> {
   using type = std::uniform_int_distribution;
 };
 
-template
-struct uniform_distribution_impl::value>> {
-  using type = std::uniform_real_distribution;
+template <>
+struct uniform_distribution_impl {
+  using type = std::bernoulli_distribution;
 };
 
 template
-struct uniform_distribution_impl()>> {
-  using type = std::bernoulli_distribution;
+struct uniform_distribution_impl::value>> {
+  using type = std::uniform_real_distribution;
 };
 
 template
-struct uniform_distribution_impl()>> {
+struct uniform_distribution_impl<
+  T,
+  std::enable_if_t() or cudf::is_fixed_point()>> {
   using type = std::uniform_int_distribution;
 };
 
@@ -131,7 +131,8 @@ class UniformRandomGenerator {
  * @param lower Lower bound of the range
  * @param upper Upper bound of the desired range
  */
-  template ()> * = nullptr>
+  template () && !cudf::is_boolean()>* = nullptr>
   UniformRandomGenerator(T lower,
                          T upper,
                          uint64_t seed = detail::random_generator_incrementing_seed())
@@ -139,6 +140,14 @@ class UniformRandomGenerator {
   {
   }
 
+  template ()>* = nullptr>
+  UniformRandomGenerator(T lower,
+                         T upper,
+                         uint64_t seed = detail::random_generator_incrementing_seed())
+    : dist{0.5}, rng{std::mt19937_64{seed}()}
+  {
+  }
+
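The `has_common_type` trait added to `cudf/utilities/traits.hpp` earlier in this diff is usable for SFINAE as well as static checks; a brief illustration, where the chosen type pairs are assumptions for the example:

```cpp
#include <cudf/utilities/traits.hpp>
#include <cudf/wrappers/timestamps.hpp>

// Arithmetic types share a std::common_type, while an integer and a
// timestamp do not, so the trait can gate mixed-type code paths.
static_assert(cudf::has_common_type_v<int32_t, int64_t>, "");
static_assert(not cudf::has_common_type_v<int32_t, cudf::timestamp_s>, "");
```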
/** * @brief Construct a new Uniform Random Generator to generate uniformly * random numbers in the range `[upper,lower]` @@ -146,7 +155,8 @@ class UniformRandomGenerator { * @param lower Lower bound of the range * @param upper Upper bound of the desired range */ - template ()> * = nullptr> + template () or cudf::is_fixed_point()>* = nullptr> UniformRandomGenerator(typename TL::rep lower, typename TL::rep upper, uint64_t seed = detail::random_generator_incrementing_seed()) @@ -157,13 +167,13 @@ class UniformRandomGenerator { /** * @brief Returns the next random number. */ - template ()> * = nullptr> + template ()>* = nullptr> T generate() { return T{dist(rng)}; } - template ()> * = nullptr> + template ()>* = nullptr> T generate() { return T{typename T::duration{dist(rng)}}; @@ -237,7 +247,7 @@ inline auto make_binning() * @return Memory resource instance */ inline std::shared_ptr create_memory_resource( - std::string const &allocation_mode) + std::string const& allocation_mode) { if (allocation_mode == "binning") return make_binning(); if (allocation_mode == "cuda") return make_cuda(); @@ -252,12 +262,12 @@ inline std::shared_ptr create_memory_resource( /** * @brief Parses the cuDF test command line options. * - * Currently only supports 'rmm_mode' string paramater, which set the rmm + * Currently only supports 'rmm_mode' string parameter, which set the rmm * allocation mode. The default value of the parameter is 'pool'. * * @return Parsing results in the form of unordered map */ -inline auto parse_cudf_test_opts(int argc, char **argv) +inline auto parse_cudf_test_opts(int argc, char** argv) { try { cxxopts::Options options(argv[0], " - cuDF tests command line options"); @@ -265,7 +275,7 @@ inline auto parse_cudf_test_opts(int argc, char **argv) "rmm_mode", "RMM allocation mode", cxxopts::value()->default_value("pool")); return options.parse(argc, argv); - } catch (const cxxopts::OptionException &e) { + } catch (const cxxopts::OptionException& e) { CUDF_FAIL("Error parsing command line options"); } } @@ -281,7 +291,7 @@ inline auto parse_cudf_test_opts(int argc, char **argv) * allocation mode used for creating the default memory resource. */ #define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char **argv) \ + int main(int argc, char** argv) \ { \ ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 74d22085b26..a4857552831 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1239,7 +1239,7 @@ class lists_column_wrapper : public detail::column_wrapper { /** * @brief Construct a lists column containing a single list of fixed-width - * type from an interator range. + * type from an iterator range. * * Example: * @code{.cpp} @@ -1621,7 +1621,7 @@ class lists_column_wrapper : public detail::column_wrapper { std::back_inserter(cols), [&](lists_column_wrapper const& l) -> column_view { // depth mismatch. attempt to normalize the short column. - // this function will also catch if this is a legitmately broken + // this function will also catch if this is a legitimately broken // set of input if (l.depth < expected_depth) { if (l.root) { diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index b60c94394d1..1e2e44c79d1 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -34,10 +34,10 @@ * redefines them properly. 
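For context on the fixture being reformatted here, a minimal test translation unit built on it would look roughly like this (the test body is illustrative):

```cpp
#include <cudf_test/base_fixture.hpp>

struct ExampleTest : public cudf::test::BaseFixture {
};

TEST_F(ExampleTest, MemoryResourceIsSet)
{
  // mr() exposes the device_memory_resource selected via the --rmm_mode
  // command line option handled by parse_cudf_test_opts above.
  EXPECT_NE(nullptr, mr());
}

// Expands to a main() that parses the options and installs the resource.
CUDF_TEST_PROGRAM_MAIN()
```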
*/ -#define Types Types_NOT_USED -#define Types0 Types0_NOT_USED -#define TypeList TypeList_NOT_USED -#define Templates Templates_NOT_USED +#define Types Types_NOT_USED +#define Types0 Types0_NOT_USED +#define TypeList TypeList_NOT_USED +#define Templates Templates_NOT_USED #define Templates0 Templates0_NOT_USED #include #undef Types @@ -104,7 +104,7 @@ struct TypeList> { { \ try { \ x; \ - } catch (const exception &e) { \ + } catch (const exception& e) { \ ASSERT_NE(nullptr, e.what()); \ EXPECT_THAT(e.what(), testing::StartsWith((startswith))); \ EXPECT_THAT(e.what(), testing::EndsWith((endswith))); \ diff --git a/cpp/include/cudf_test/cxxopts.hpp b/cpp/include/cudf_test/cxxopts.hpp index 49c551ab2f1..5135fd02e21 100644 --- a/cpp/include/cudf_test/cxxopts.hpp +++ b/cpp/include/cudf_test/cxxopts.hpp @@ -89,7 +89,9 @@ inline String& stringAppend(String& s, String a) { return s.append(std::move(a)) inline String& stringAppend(String& s, int n, UChar32 c) { - for (int i = 0; i != n; ++i) { s.append(c); } + for (int i = 0; i != n; ++i) { + s.append(c); + } return s; } @@ -1449,7 +1451,9 @@ inline void Options::generate_all_groups_help(String& result) const std::vector all_groups; all_groups.reserve(m_help.size()); - for (auto& group : m_help) { all_groups.push_back(group.first); } + for (auto& group : m_help) { + all_groups.push_back(group.first); + } generate_group_help(result, all_groups); } diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp index 13394445922..90bf0cd99dc 100644 --- a/cpp/include/cudf_test/file_utilities.hpp +++ b/cpp/include/cudf_test/file_utilities.hpp @@ -28,17 +28,17 @@ class temp_directory { std::string _path; public: - temp_directory(const std::string &base_name) + temp_directory(const std::string& base_name) { std::string dir_template("/tmp"); - if (const char *env_p = std::getenv("WORKSPACE")) dir_template = env_p; + if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p; dir_template += "/" + base_name + ".XXXXXX"; - auto const tmpdirptr = mkdtemp(const_cast(dir_template.data())); + auto const tmpdirptr = mkdtemp(const_cast(dir_template.data())); if (tmpdirptr == nullptr) CUDF_FAIL("Temporary directory creation failure: " + dir_template); _path = dir_template + "/"; } - static int rm_files(const char *pathname, const struct stat *sbuf, int type, struct FTW *ftwb) + static int rm_files(const char* pathname, const struct stat* sbuf, int type, struct FTW* ftwb) { return std::remove(pathname); } @@ -49,5 +49,5 @@ class temp_directory { nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); } - const std::string &path() const { return _path; } + const std::string& path() const { return _path; } }; diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp index a3f771c2f72..1588e3c9be9 100644 --- a/cpp/include/cudf_test/type_list_utilities.hpp +++ b/cpp/include/cudf_test/type_list_utilities.hpp @@ -32,7 +32,7 @@ * template * class TestFixture : ::testing::Test { }; * - * TYPED_TEST_CASE(TestFixure, TestTypes); + * TYPED_TEST_CASE(TestFixture, TestTypes); * * TYPED_TEST(TestFixture, mytest){ * using Type0 = GetType; // the first type element @@ -169,7 +169,7 @@ struct ConcatImpl<> { }; /** - * @brief Concantenates compile-time lists of types into a single type list. + * @brief Concatenates compile-time lists of types into a single type list. 
* * Example: * ``` diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index aeddafae253..5c1b0c6c458 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -59,7 +59,7 @@ constexpr std::array types_to_ids_impl( * array == {type_id::INT32, type_id::FLOAT}; * ``` * - * @tparam TYPES List of types to conver to `type_id`s + * @tparam TYPES List of types to convert to `type_id`s * @return `std::array` of `type_id`s corresponding to each type in `TYPES` */ template diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py index 2a7b66d4f77..c32e984278f 100755 --- a/cpp/scripts/run-clang-format.py +++ b/cpp/scripts/run-clang-format.py @@ -22,7 +22,7 @@ import sys import tempfile -EXPECTED_VERSION = "8.0.1" +EXPECTED_VERSION = "11.0.0" VERSION_REGEX = re.compile(r"clang-format version ([0-9.]+)") # NOTE: populate this list with more top-level dirs as we add more of them to # the cudf repo diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index f0fd865f685..53a55351f8e 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -88,6 +88,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, m2_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, var_aggregation const& agg) { @@ -155,25 +161,31 @@ std::vector> simple_aggregations_collector::visit( } std::vector> simple_aggregations_collector::visit( - data_type col_type, merge_lists_aggregation const& agg) + data_type col_type, lead_lag_aggregation const& agg) { return visit(col_type, static_cast(agg)); } std::vector> simple_aggregations_collector::visit( - data_type col_type, merge_sets_aggregation const& agg) + data_type col_type, udf_aggregation const& agg) { return visit(col_type, static_cast(agg)); } std::vector> simple_aggregations_collector::visit( - data_type col_type, lead_lag_aggregation const& agg) + data_type col_type, merge_lists_aggregation const& agg) { return visit(col_type, static_cast(agg)); } std::vector> simple_aggregations_collector::visit( - data_type col_type, udf_aggregation const& agg) + data_type col_type, merge_sets_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, merge_m2_aggregation const& agg) { return visit(col_type, static_cast(agg)); } @@ -227,6 +239,11 @@ void aggregation_finalizer::visit(mean_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(m2_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(var_aggregation const& agg) { visit(static_cast(agg)); @@ -282,22 +299,27 @@ void aggregation_finalizer::visit(collect_set_aggregation const& agg) visit(static_cast(agg)); } -void aggregation_finalizer::visit(merge_lists_aggregation const& agg) +void aggregation_finalizer::visit(lead_lag_aggregation const& agg) { visit(static_cast(agg)); } -void aggregation_finalizer::visit(merge_sets_aggregation const& agg) +void aggregation_finalizer::visit(udf_aggregation const& agg) { visit(static_cast(agg)); } -void aggregation_finalizer::visit(lead_lag_aggregation const& agg) +void aggregation_finalizer::visit(merge_lists_aggregation const& agg) { visit(static_cast(agg)); } -void 
aggregation_finalizer::visit(udf_aggregation const& agg) +void aggregation_finalizer::visit(merge_sets_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(merge_m2_aggregation const& agg) { visit(static_cast(agg)); } @@ -311,7 +333,7 @@ std::vector> aggregation::get_simple_aggregations( } /// Factory to create a SUM aggregation -template +template std::unique_ptr make_sum_aggregation() { return std::make_unique(); @@ -320,7 +342,7 @@ template std::unique_ptr make_sum_aggregation(); template std::unique_ptr make_sum_aggregation(); /// Factory to create a PRODUCT aggregation -template +template std::unique_ptr make_product_aggregation() { return std::make_unique(); @@ -328,7 +350,7 @@ std::unique_ptr make_product_aggregation() template std::unique_ptr make_product_aggregation(); /// Factory to create a MIN aggregation -template +template std::unique_ptr make_min_aggregation() { return std::make_unique(); @@ -337,7 +359,7 @@ template std::unique_ptr make_min_aggregation(); template std::unique_ptr make_min_aggregation(); /// Factory to create a MAX aggregation -template +template std::unique_ptr make_max_aggregation() { return std::make_unique(); @@ -346,7 +368,7 @@ template std::unique_ptr make_max_aggregation(); template std::unique_ptr make_max_aggregation(); /// Factory to create a COUNT aggregation -template +template std::unique_ptr make_count_aggregation(null_policy null_handling) { auto kind = @@ -359,7 +381,7 @@ template std::unique_ptr make_count_aggregation +template std::unique_ptr make_any_aggregation() { return std::make_unique(); @@ -367,7 +389,7 @@ std::unique_ptr make_any_aggregation() template std::unique_ptr make_any_aggregation(); /// Factory to create a ALL aggregation -template +template std::unique_ptr make_all_aggregation() { return std::make_unique(); @@ -375,7 +397,7 @@ std::unique_ptr make_all_aggregation() template std::unique_ptr make_all_aggregation(); /// Factory to create a SUM_OF_SQUARES aggregation -template +template std::unique_ptr make_sum_of_squares_aggregation() { return std::make_unique(); @@ -383,7 +405,7 @@ std::unique_ptr make_sum_of_squares_aggregation() template std::unique_ptr make_sum_of_squares_aggregation(); /// Factory to create a MEAN aggregation -template +template std::unique_ptr make_mean_aggregation() { return std::make_unique(); @@ -391,8 +413,16 @@ std::unique_ptr make_mean_aggregation() template std::unique_ptr make_mean_aggregation(); template std::unique_ptr make_mean_aggregation(); +/// Factory to create a M2 aggregation +template +std::unique_ptr make_m2_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_m2_aggregation(); + /// Factory to create a VARIANCE aggregation -template +template std::unique_ptr make_variance_aggregation(size_type ddof) { return std::make_unique(ddof); @@ -400,7 +430,7 @@ std::unique_ptr make_variance_aggregation(size_type ddof) template std::unique_ptr make_variance_aggregation(size_type ddof); /// Factory to create a STD aggregation -template +template std::unique_ptr make_std_aggregation(size_type ddof) { return std::make_unique(ddof); @@ -408,7 +438,7 @@ std::unique_ptr make_std_aggregation(size_type ddof) template std::unique_ptr make_std_aggregation(size_type ddof); /// Factory to create a MEDIAN aggregation -template +template std::unique_ptr make_median_aggregation() { return std::make_unique(); @@ -416,7 +446,7 @@ std::unique_ptr make_median_aggregation() template std::unique_ptr make_median_aggregation(); /// Factory to create 
a QUANTILE aggregation -template +template std::unique_ptr make_quantile_aggregation(std::vector const& q, interpolation i) { return std::make_unique(q, i); @@ -425,7 +455,7 @@ template std::unique_ptr make_quantile_aggregation( std::vector const& q, interpolation i); /// Factory to create an ARGMAX aggregation -template +template std::unique_ptr make_argmax_aggregation() { return std::make_unique(); @@ -434,7 +464,7 @@ template std::unique_ptr make_argmax_aggregation(); template std::unique_ptr make_argmax_aggregation(); /// Factory to create an ARGMIN aggregation -template +template std::unique_ptr make_argmin_aggregation() { return std::make_unique(); @@ -443,7 +473,7 @@ template std::unique_ptr make_argmin_aggregation(); template std::unique_ptr make_argmin_aggregation(); /// Factory to create an NUNIQUE aggregation -template +template std::unique_ptr make_nunique_aggregation(null_policy null_handling) { return std::make_unique(null_handling); @@ -452,7 +482,7 @@ template std::unique_ptr make_nunique_aggregation( null_policy null_handling); /// Factory to create an NTH_ELEMENT aggregation -template +template std::unique_ptr make_nth_element_aggregation(size_type n, null_policy null_handling) { return std::make_unique(n, null_handling); @@ -461,7 +491,7 @@ template std::unique_ptr make_nth_element_aggregation( size_type n, null_policy null_handling); /// Factory to create a ROW_NUMBER aggregation -template +template std::unique_ptr make_row_number_aggregation() { return std::make_unique(); @@ -470,7 +500,7 @@ template std::unique_ptr make_row_number_aggregation() template std::unique_ptr make_row_number_aggregation(); /// Factory to create a COLLECT_LIST aggregation -template +template std::unique_ptr make_collect_list_aggregation(null_policy null_handling) { return std::make_unique(null_handling); @@ -481,7 +511,7 @@ template std::unique_ptr make_collect_list_aggregation +template std::unique_ptr make_collect_set_aggregation(null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal) @@ -493,26 +523,8 @@ template std::unique_ptr make_collect_set_aggregation( template std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); -/// Factory to create a MERGE_LISTS aggregation -template -std::unique_ptr make_merge_lists_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_merge_lists_aggregation(); - -/// Factory to create a MERGE_SETS aggregation -template -std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, - nan_equality nans_equal) -{ - return std::make_unique(nulls_equal, nans_equal); -} -template std::unique_ptr make_merge_sets_aggregation(null_equality, - nan_equality); - /// Factory to create a LAG aggregation -template +template std::unique_ptr make_lag_aggregation(size_type offset) { return std::make_unique(aggregation::LAG, offset); @@ -522,7 +534,7 @@ template std::unique_ptr make_lag_aggregation +template std::unique_ptr make_lead_aggregation(size_type offset) { return std::make_unique(aggregation::LEAD, offset); @@ -532,7 +544,7 @@ template std::unique_ptr make_lead_aggregation +template std::unique_ptr make_udf_aggregation(udf_type type, std::string const& user_defined_aggregator, data_type output_type) @@ -548,6 +560,32 @@ template std::unique_ptr make_udf_aggregation( template std::unique_ptr make_udf_aggregation( udf_type type, std::string const& user_defined_aggregator, data_type output_type); +/// Factory to create a MERGE_LISTS aggregation 
+template +std::unique_ptr make_merge_lists_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_merge_lists_aggregation(); + +/// Factory to create a MERGE_SETS aggregation +template +std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, + nan_equality nans_equal) +{ + return std::make_unique(nulls_equal, nans_equal); +} +template std::unique_ptr make_merge_sets_aggregation(null_equality, + nan_equality); + +/// Factory to create a MERGE_M2 aggregation +template +std::unique_ptr make_merge_m2_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_merge_m2_aggregation(); + namespace detail { namespace { struct target_type_functor { diff --git a/cpp/src/ast/linearizer.cpp b/cpp/src/ast/linearizer.cpp index 66a32ead35e..3e442305552 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/linearizer.cpp @@ -111,7 +111,9 @@ cudf::size_type linearizer::visit(column_reference const& expr) // Increment the node index _node_count++; // Resolve node type - auto const data_type = expr.get_data_type(_table); + auto const data_type = expr.get_table_source() == table_reference::LEFT + ? expr.get_data_type(_left) + : expr.get_data_type(_right); // Push data reference auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN, data_type, diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index 43d3bde97c2..7aa89635c54 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -49,37 +49,37 @@ namespace detail { * This evaluates an expression over a table to produce a new column. Also called an n-ary * transform. * - * @tparam block_size + * @tparam max_block_size The size of the thread block, used to set launch + * bounds and minimize register usage. + * @tparam has_nulls whether or not the output column may contain nulls. + * * @param table The table device view used for evaluation. - * @param literals Array of literal values used for evaluation. - * @param output_column The output column where results are stored. - * @param data_references Array of data references. - * @param operators Array of operators to perform. - * @param operator_source_indices Array of source indices for the operators. - * @param num_operators Number of operators. - * @param num_intermediates Number of intermediates, used to allocate a portion of shared memory to - * each thread. + * @param plan Container of device data required to evaluate the desired expression. + * @param output_column The destination for the results of evaluating the expression. */ -template -__launch_bounds__(max_block_size) __global__ void compute_column_kernel( - table_device_view const table, - device_span literals, - mutable_column_device_view output_column, - device_span data_references, - device_span operators, - device_span operator_source_indices, - cudf::size_type num_intermediates) +template +__launch_bounds__(max_block_size) __global__ + void compute_column_kernel(table_device_view const table, + device_ast_plan plan, + mutable_column_device_view output_column) { - extern __shared__ std::int64_t intermediate_storage[]; - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates]; + // The (required) extern storage of the shared memory array leads to + // conflicting declarations between different templates. The easiest + // workaround is to declare an arbitrary (here char) array type then cast it + // after the fact to the appropriate type. 
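+  // Concretely: this kernel is instantiated for both has_nulls = true and
+  // has_nulls = false in the same translation unit, and each instantiation
+  // would otherwise declare the same extern __shared__ symbol with a
+  // different IntermediateDataType, which CUDA rejects; a single char array
+  // shared by all instantiations avoids the conflicting declarations.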
+  extern __shared__ char raw_intermediate_storage[];
+  IntermediateDataType<has_nulls>* intermediate_storage =
+    reinterpret_cast<IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
+
+  auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates];
   auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
   auto const stride    = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
-  auto const evaluator =
-    cudf::ast::detail::row_evaluator(table, literals, thread_intermediate_storage, &output_column);
+  auto evaluator =
+    cudf::ast::detail::expression_evaluator<has_nulls>(table, plan, thread_intermediate_storage);

   for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
-    evaluate_row_expression(
-      evaluator, data_references, operators, operator_source_indices, row_index);
+    auto output_dest = mutable_column_expression_result<has_nulls>(output_column);
+    evaluator.evaluate(output_dest, row_index);
   }
 }
@@ -88,22 +88,30 @@ std::unique_ptr<column> compute_column(table_view const table,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
-  auto const expr_linearizer = linearizer(expr, table);                // Linearize the AST
-  auto const plan            = ast_plan{expr_linearizer, stream, mr};  // Create ast_plan
+  // Prepare output column. Whether or not the output column is nullable is
+  // determined by whether any of the columns in the input table are nullable.
+  // If none of the input columns actually contain nulls, we can still use the
+  // non-nullable version of the expression evaluation code path for
+  // performance, so we capture that information as well.
+  auto const nullable =
+    std::any_of(table.begin(), table.end(), [](column_view c) { return c.nullable(); });
+  auto const has_nulls = nullable && std::any_of(table.begin(), table.end(), [](column_view c) {
+                           return c.nullable() && c.has_nulls();
+                         });

-  // Create table device view
-  auto table_device         = table_device_view::create(table, stream);
-  auto const table_num_rows = table.num_rows();
+  auto const plan = ast_plan{expr, table, has_nulls, stream, mr};
+
+  auto const output_column_mask_state =
+    nullable ? (has_nulls ? mask_state::UNINITIALIZED : mask_state::ALL_VALID)
+             : mask_state::UNALLOCATED;

-  // Prepare output column
   auto output_column = cudf::make_fixed_width_column(
-    expr_linearizer.root_data_type(), table_num_rows, mask_state::UNALLOCATED, stream, mr);
+    plan.output_type(), table.num_rows(), output_column_mask_state, stream, mr);
   auto mutable_output_device =
     cudf::mutable_column_device_view::create(output_column->mutable_view(), stream);

   // Configure kernel parameters
-  auto const num_intermediates     = expr_linearizer.intermediate_count();
-  auto const shmem_size_per_thread = static_cast<int>(sizeof(std::int64_t) * num_intermediates);
+  auto const& dev_plan = plan.dev_plan;
   int device_id;
   CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
@@ -111,22 +119,23 @@ std::unique_ptr<column> compute_column(table_view const table,
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   auto constexpr MAX_BLOCK_SIZE = 128;
   auto const block_size =
-    shmem_size_per_thread != 0
-      ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / shmem_size_per_thread)
+    dev_plan.shmem_per_thread != 0
+      ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / dev_plan.shmem_per_thread)
       : MAX_BLOCK_SIZE;
-  auto const config               = cudf::detail::grid_1d{table_num_rows, block_size};
-  auto const shmem_size_per_block = shmem_size_per_thread * config.num_threads_per_block;
+  auto const config          = cudf::detail::grid_1d{table.num_rows(), block_size};
+  auto const shmem_per_block = dev_plan.shmem_per_thread * config.num_threads_per_block;

   // Execute the kernel
-  cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE>
-    <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-      *table_device,
-      plan._device_literals,
-      *mutable_output_device,
-      plan._device_data_references,
-      plan._device_operators,
-      plan._device_operator_source_indices,
-      num_intermediates);
+  auto table_device = table_device_view::create(table, stream);
+  if (has_nulls) {
+    cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE, true>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_per_block, stream.value()>>>(
+        *table_device, dev_plan, *mutable_output_device);
+  } else {
+    cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE, false>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_per_block, stream.value()>>>(
+        *table_device, dev_plan, *mutable_output_device);
+  }
   CHECK_CUDA(stream.value());
   return output_column;
 }
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 11a3383ee87..aaf193ff5cf 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -588,7 +588,7 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          rmm::mr::device_memory_resource* mr)
 {
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
+    return experimental::binary_operation(lhs, rhs, op, output_type, mr);

   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -615,7 +615,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          rmm::mr::device_memory_resource* mr)
 {
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
+    return experimental::binary_operation(lhs, rhs, op, output_type, mr);

   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -644,7 +644,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");

   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
+    return experimental::binary_operation(lhs, rhs, op, output_type, mr);

   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -757,4 +757,78 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   return detail::binary_operation(lhs, rhs, ptx, output_type, rmm::cuda_stream_default, mr);
 }

+// Experimental Compiled Binary operation
+namespace experimental {
+namespace detail {
+/**
+ * @copydoc cudf::experimental::binary_operation(column_view const&, column_view const&,
+ * binary_operator, data_type, rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+template <typename LhsType, typename RhsType>
+std::unique_ptr<column> binary_operation(LhsType const& lhs,
+                                         RhsType const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  if constexpr (std::is_same_v<LhsType, column_view> and std::is_same_v<RhsType, column_view>)
+    CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
+
+  if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and
+      output_type.id() == type_id::STRING and
+      (op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN))
+    return binops::compiled::string_null_min_max(lhs, rhs, op, output_type, stream, mr);
+
+  if (not binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op))
+    CUDF_FAIL("Unsupported operator for these types");
+
+  // TODO check if scale conversion required?
+  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
+  //   CUDF_FAIL("Not yet supported fixed_point");
+  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
+
+  auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
+
+  if constexpr (std::is_same_v<LhsType, column_view>)
+    if (lhs.is_empty()) return out;
+  if constexpr (std::is_same_v<RhsType, column_view>)
+    if (rhs.is_empty()) return out;
+
+  auto out_view = out->mutable_view();
+  cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream);
+  return out;
+}
+}  // namespace detail
+
+std::unique_ptr<column> binary_operation(scalar const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         scalar const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+}  // namespace experimental
 }  // namespace cudf
diff --git a/cpp/src/binaryop/compiled/ATan2.cu b/cpp/src/binaryop/compiled/ATan2.cu
new file mode 100644
index 00000000000..8e5cbf57f55
--- /dev/null
+++ b/cpp/src/binaryop/compiled/ATan2.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Add.cu b/cpp/src/binaryop/compiled/Add.cu new file mode 100644 index 00000000000..4cd2ced66f4 --- /dev/null +++ b/cpp/src/binaryop/compiled/Add.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/BitwiseAnd.cu b/cpp/src/binaryop/compiled/BitwiseAnd.cu new file mode 100644 index 00000000000..6abac2bd197 --- /dev/null +++ b/cpp/src/binaryop/compiled/BitwiseAnd.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/BitwiseOr.cu b/cpp/src/binaryop/compiled/BitwiseOr.cu new file mode 100644 index 00000000000..6d523cbf1d1 --- /dev/null +++ b/cpp/src/binaryop/compiled/BitwiseOr.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/BitwiseXor.cu b/cpp/src/binaryop/compiled/BitwiseXor.cu new file mode 100644 index 00000000000..45175681574 --- /dev/null +++ b/cpp/src/binaryop/compiled/BitwiseXor.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Div.cu b/cpp/src/binaryop/compiled/Div.cu new file mode 100644 index 00000000000..7cc895ecd06 --- /dev/null +++ b/cpp/src/binaryop/compiled/Div.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/FloorDiv.cu b/cpp/src/binaryop/compiled/FloorDiv.cu new file mode 100644 index 00000000000..99ea2706b86 --- /dev/null +++ b/cpp/src/binaryop/compiled/FloorDiv.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Greater.cu b/cpp/src/binaryop/compiled/Greater.cu new file mode 100644 index 00000000000..679e029b5fc --- /dev/null +++ b/cpp/src/binaryop/compiled/Greater.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/GreaterEqual.cu b/cpp/src/binaryop/compiled/GreaterEqual.cu new file mode 100644 index 00000000000..23b0c6aaa0d --- /dev/null +++ b/cpp/src/binaryop/compiled/GreaterEqual.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Less.cu b/cpp/src/binaryop/compiled/Less.cu new file mode 100644 index 00000000000..7ab5dfe3478 --- /dev/null +++ b/cpp/src/binaryop/compiled/Less.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LessEqual.cu b/cpp/src/binaryop/compiled/LessEqual.cu new file mode 100644 index 00000000000..983c50c9575 --- /dev/null +++ b/cpp/src/binaryop/compiled/LessEqual.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LogBase.cu b/cpp/src/binaryop/compiled/LogBase.cu new file mode 100644 index 00000000000..bdc709b86bf --- /dev/null +++ b/cpp/src/binaryop/compiled/LogBase.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LogicalAnd.cu b/cpp/src/binaryop/compiled/LogicalAnd.cu new file mode 100644 index 00000000000..08112fadfff --- /dev/null +++ b/cpp/src/binaryop/compiled/LogicalAnd.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LogicalOr.cu b/cpp/src/binaryop/compiled/LogicalOr.cu new file mode 100644 index 00000000000..bc400afd4cd --- /dev/null +++ b/cpp/src/binaryop/compiled/LogicalOr.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Mod.cu b/cpp/src/binaryop/compiled/Mod.cu new file mode 100644 index 00000000000..0b82c09c8a6 --- /dev/null +++ b/cpp/src/binaryop/compiled/Mod.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Mul.cu b/cpp/src/binaryop/compiled/Mul.cu new file mode 100644 index 00000000000..15394245259 --- /dev/null +++ b/cpp/src/binaryop/compiled/Mul.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/NullMax.cu b/cpp/src/binaryop/compiled/NullMax.cu new file mode 100644 index 00000000000..78a44041cba --- /dev/null +++ b/cpp/src/binaryop/compiled/NullMax.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/NullMin.cu b/cpp/src/binaryop/compiled/NullMin.cu new file mode 100644 index 00000000000..629ab600fd7 --- /dev/null +++ b/cpp/src/binaryop/compiled/NullMin.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/PMod.cu b/cpp/src/binaryop/compiled/PMod.cu new file mode 100644 index 00000000000..36902c0ed10 --- /dev/null +++ b/cpp/src/binaryop/compiled/PMod.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Pow.cu b/cpp/src/binaryop/compiled/Pow.cu new file mode 100644 index 00000000000..c6f897ee18d --- /dev/null +++ b/cpp/src/binaryop/compiled/Pow.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/PyMod.cu b/cpp/src/binaryop/compiled/PyMod.cu new file mode 100644 index 00000000000..b05dcd8e7bc --- /dev/null +++ b/cpp/src/binaryop/compiled/PyMod.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/ShiftLeft.cu b/cpp/src/binaryop/compiled/ShiftLeft.cu new file mode 100644 index 00000000000..6cc950b2d50 --- /dev/null +++ b/cpp/src/binaryop/compiled/ShiftLeft.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/ShiftRight.cu b/cpp/src/binaryop/compiled/ShiftRight.cu new file mode 100644 index 00000000000..1ddd7100a73 --- /dev/null +++ b/cpp/src/binaryop/compiled/ShiftRight.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu b/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu new file mode 100644 index 00000000000..a87b4b9f9ac --- /dev/null +++ b/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Sub.cu b/cpp/src/binaryop/compiled/Sub.cu new file mode 100644 index 00000000000..e0cf47c1310 --- /dev/null +++ b/cpp/src/binaryop/compiled/Sub.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/TrueDiv.cu b/cpp/src/binaryop/compiled/TrueDiv.cu new file mode 100644 index 00000000000..d8f1d956340 --- /dev/null +++ b/cpp/src/binaryop/compiled/TrueDiv.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 2b24e0cfa3d..1dd00c4b981 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -15,13 +15,12 @@ */ #include "binary_ops.hpp" +#include "operation.cuh" +#include #include -#include -#include #include -#include -#include +#include #include #include @@ -32,204 +31,76 @@ namespace binops { namespace compiled { namespace { - -template -struct apply_binop { - binary_operator op; - apply_binop(binary_operator op) : op(op) {} - CUDA_DEVICE_CALLABLE Out operator()(Lhs const& x, Rhs const& y) const - { - switch (op) { - case binary_operator::EQUAL: return this->equal(x, y); - case binary_operator::NOT_EQUAL: return this->not_equal(x, y); - case binary_operator::LESS: return this->less(x, y); - case binary_operator::GREATER: return this->greater(x, y); - case binary_operator::LESS_EQUAL: return this->less_equal(x, y); - case binary_operator::GREATER_EQUAL: return this->greater_equal(x, y); - default: return Out{}; - } - } - CUDA_DEVICE_CALLABLE Out equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x == y); - } - CUDA_DEVICE_CALLABLE Out not_equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x != y); - } - CUDA_DEVICE_CALLABLE Out less(Lhs const& x, Rhs const& y) const - { - return static_cast(x < y); - } - CUDA_DEVICE_CALLABLE Out greater(Lhs const& x, Rhs const& y) const - { - return static_cast(x > y); - } - CUDA_DEVICE_CALLABLE Out less_equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x <= y); - } - CUDA_DEVICE_CALLABLE Out greater_equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x >= y); - } -}; - -template -struct apply_binop_scalar_lhs_rhs : apply_binop { - cudf::scalar_device_type_t scalar; - apply_binop_scalar_lhs_rhs(binary_operator op, cudf::scalar_device_type_t scalar) - : apply_binop(op), scalar(scalar) - { - } - CUDA_DEVICE_CALLABLE Out operator()(Lhs const& x) const - { - return apply_binop::operator()(x, scalar.value()); - } -}; - -template -struct apply_binop_scalar_rhs_lhs : apply_binop { - cudf::scalar_device_type_t scalar; - 
apply_binop_scalar_rhs_lhs(binary_operator op, cudf::scalar_device_type_t scalar) - : apply_binop(op), scalar(scalar) +/** + * @brief Converts scalar to column_device_view with single element. + * + * @return pair with column_device_view and column containing any auxilary data to create + * column_view from scalar + */ +struct scalar_as_column_device_view { + using return_type = typename std::pair>; + template ())>* = nullptr> + return_type operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + auto h_scalar_type_view = static_cast&>(const_cast(s)); + auto col_v = + column_view(s.type(), 1, h_scalar_type_view.data(), (bitmask_type const*)s.validity_data()); + return std::pair{column_device_view::create(col_v, stream), std::unique_ptr(nullptr)}; } - CUDA_DEVICE_CALLABLE Out operator()(Lhs const& x) const + template ())>* = nullptr> + return_type operator()(scalar const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { - return apply_binop::operator()(scalar.value(), x); + CUDF_FAIL("Unsupported type"); } }; +// specialization for cudf::string_view +template <> +scalar_as_column_device_view::return_type +scalar_as_column_device_view::operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using T = cudf::string_view; + auto h_scalar_type_view = static_cast&>(const_cast(s)); + + // build offsets column from the string size + auto offsets_transformer_itr = + thrust::make_constant_iterator(h_scalar_type_view.size()); + auto offsets_column = strings::detail::make_offsets_child_column( + offsets_transformer_itr, offsets_transformer_itr + 1, stream, mr); + + auto chars_column_v = + column_view(data_type{type_id::INT8}, h_scalar_type_view.size(), h_scalar_type_view.data()); + // Construct string column_view + auto col_v = column_view(s.type(), + 1, + nullptr, + (bitmask_type const*)s.validity_data(), + cudf::UNKNOWN_NULL_COUNT, + 0, + {offsets_column->view(), chars_column_v}); + return std::pair{column_device_view::create(col_v, stream), std::move(offsets_column)}; +} -template -struct binary_op { - std::unique_ptr operator()(column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type out_type, - bool const reversed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto new_mask = binops::detail::scalar_col_valid_mask_and(lhs, rhs, stream, mr); - auto out = make_fixed_width_column(out_type, - lhs.size(), - std::move(new_mask), - rhs.is_valid(stream) ? cudf::UNKNOWN_NULL_COUNT : lhs.size(), - stream, - mr); - - if (lhs.size() > 0 && rhs.is_valid(stream)) { - auto out_view = out->mutable_view(); - auto out_itr = out_view.begin(); - auto lhs_device_view = column_device_view::create(lhs, stream); - using rhs_type = cudf::scalar_type_t; - auto rhs_scalar = rhs_type(static_cast(rhs), stream); - auto rhs_scalar_view = get_scalar_device_view(rhs_scalar); - if (lhs.has_nulls()) { - auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); - reversed - ? 
thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_lhs_rhs{op, rhs_scalar_view}); - } else { - auto lhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); - reversed - ? thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_lhs_rhs{op, rhs_scalar_view}); - } - } - - CHECK_CUDA(stream.value()); - - return out; - } - - std::unique_ptr operator()(column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type out_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); - auto out = make_fixed_width_column( - out_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); - - if (lhs.size() > 0) { - auto out_view = out->mutable_view(); - auto out_itr = out_view.begin(); - auto lhs_device_view = column_device_view::create(lhs, stream); - auto rhs_device_view = column_device_view::create(rhs, stream); - if (lhs.has_nulls() && rhs.has_nulls()) { - auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); - auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } else if (lhs.has_nulls()) { - auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); - auto rhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } else if (rhs.has_nulls()) { - auto lhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); - auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } else { - auto lhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); - auto rhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } - } - - CHECK_CUDA(stream.value()); - - return out; - } -}; +/** + * @brief Converts scalar to column_device_view with single element. + * + * @param scal scalar to convert + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * @return pair with column_device_view and column containing any auxilary data to create + * column_view from scalar + */ +auto scalar_to_column_device_view( + scalar const& scal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + return type_dispatcher(scal.type(), scalar_as_column_device_view{}, scal, stream, mr); +} // This functor does the actual comparison between string column value and a scalar string // or between two string column values using a comparator @@ -337,152 +208,181 @@ struct null_considering_binop { // Create device views for inputs auto const lhs_dev_view = get_device_view(lhs); auto const rhs_dev_view = get_device_view(rhs); - - switch (op) { - case binary_operator::NULL_EQUALS: { - // Validate input - CUDF_EXPECTS(output_type.id() == type_id::BOOL8, "Output column type has to be bool"); - - // Make a bool8 numeric output column - out = make_numeric_column( - data_type{type_id::BOOL8}, col_size, mask_state::ALL_VALID, stream, mr); - - // Create a compare function lambda - auto equal_func = [] __device__(bool lhs_valid, - bool rhs_valid, - cudf::string_view lhs_value, - cudf::string_view rhs_value) { - if (!lhs_valid && !rhs_valid) return true; - if (lhs_valid && rhs_valid) return (lhs_value == rhs_value); - return false; - }; - - // Populate output column - populate_out_col(lhs_dev_view, - rhs_dev_view, - col_size, - stream, - equal_func, - mutable_column_view{*out}.begin()); - - break; - } - - case binary_operator::NULL_MAX: - case binary_operator::NULL_MIN: { - // Validate input - CUDF_EXPECTS(output_type.id() == lhs.type().id(), - "Output column type should match input column type"); - - // Shallow copy of the resultant strings - rmm::device_uvector out_col_strings(col_size, stream); - - // Invalid output column strings - null rows - cudf::string_view const invalid_str{nullptr, 0}; - - // Create a compare function lambda - auto minmax_func = [op, invalid_str] __device__(bool lhs_valid, - bool rhs_valid, - cudf::string_view lhs_value, - cudf::string_view rhs_value) { - if (!lhs_valid && !rhs_valid) - return invalid_str; - else if (lhs_valid && rhs_valid) { - return (op == binary_operator::NULL_MAX) - ? thrust::maximum()(lhs_value, rhs_value) - : thrust::minimum()(lhs_value, rhs_value); - } else if (lhs_valid) - return lhs_value; - else - return rhs_value; - }; - - // Populate output column - populate_out_col( - lhs_dev_view, rhs_dev_view, col_size, stream, minmax_func, out_col_strings.data()); - - // Create an output column with the resultant strings - out = cudf::make_strings_column(out_col_strings, invalid_str, stream, mr); - - break; - } - - default: { - CUDF_FAIL("Null aware binop not supported"); - } - } - - return out; + // Validate input + CUDF_EXPECTS(output_type.id() == lhs.type().id(), + "Output column type should match input column type"); + + // Shallow copy of the resultant strings + rmm::device_uvector out_col_strings(col_size, stream); + + // Invalid output column strings - null rows + cudf::string_view const invalid_str{nullptr, 0}; + + // Create a compare function lambda + auto minmax_func = + [op, invalid_str] __device__( + bool lhs_valid, bool rhs_valid, cudf::string_view lhs_value, cudf::string_view rhs_value) { + if (!lhs_valid && !rhs_valid) + return invalid_str; + else if (lhs_valid && rhs_valid) { + return (op == binary_operator::NULL_MAX) + ? 
thrust::maximum()(lhs_value, rhs_value) + : thrust::minimum()(lhs_value, rhs_value); + } else if (lhs_valid) + return lhs_value; + else + return rhs_value; + }; + + // Populate output column + populate_out_col( + lhs_dev_view, rhs_dev_view, col_size, stream, minmax_func, out_col_strings.data()); + + // Create an output column with the resultant strings + return cudf::make_strings_column(out_col_strings, invalid_str, stream, mr); } }; } // namespace -std::unique_ptr binary_operation(scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr string_null_min_max(scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(rhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported rhs datatype"); - if (is_null_dependent(op)) { - if (rhs.is_empty()) return cudf::make_empty_column(output_type); - auto rhs_device_view = cudf::column_device_view::create(rhs, stream); - return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), stream, mr); - } else { - CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); - // Should pass the right type of scalar and column_view when specializing binary_op - return binary_op{}( - rhs, lhs, op, output_type, true, stream, mr); - } + CUDF_EXPECTS(op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN, + "Unsupported binary operation"); + if (rhs.is_empty()) return cudf::make_empty_column(output_type); + auto rhs_device_view = cudf::column_device_view::create(rhs, stream); + return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), stream, mr); } -std::unique_ptr binary_operation(column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr string_null_min_max(column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(rhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported rhs datatype"); - if (is_null_dependent(op)) { - if (lhs.is_empty()) return cudf::make_empty_column(output_type); - auto lhs_device_view = cudf::column_device_view::create(lhs, stream); - return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), stream, mr); - } else { - CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); - return binary_op{}( - lhs, rhs, op, output_type, false, stream, mr); - } + CUDF_EXPECTS(op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN, + "Unsupported binary operation"); + if (lhs.is_empty()) return cudf::make_empty_column(output_type); + auto lhs_device_view = cudf::column_device_view::create(lhs, stream); + return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), stream, mr); } -std::unique_ptr binary_operation(column_view const& lhs, - column_view const& rhs, - 
binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr string_null_min_max(column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(rhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported rhs datatype"); - if (is_null_dependent(op)) { - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes do not match"); - if (lhs.is_empty()) return cudf::make_empty_column(output_type); - auto lhs_device_view = cudf::column_device_view::create(lhs, stream); - auto rhs_device_view = cudf::column_device_view::create(rhs, stream); - return null_considering_binop{}( - *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), stream, mr); - } else { - CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); - return binary_op{}( - lhs, rhs, op, output_type, stream, mr); - } + CUDF_EXPECTS(op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN, + "Unsupported binary operation"); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes do not match"); + if (lhs.is_empty()) return cudf::make_empty_column(output_type); + auto lhs_device_view = cudf::column_device_view::create(lhs, stream); + auto rhs_device_view = cudf::column_device_view::create(rhs, stream); + return null_considering_binop{}( + *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), stream, mr); +} + +void operator_dispatcher(mutable_column_device_view& out, + column_device_view const& lhs, + column_device_view const& rhs, + bool is_lhs_scalar, + bool is_rhs_scalar, + binary_operator op, + rmm::cuda_stream_view stream) +{ + // clang-format off +switch (op) { +case binary_operator::ADD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::SUB: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::MUL: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::DIV: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::TRUE_DIV: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::FLOOR_DIV: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::MOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::PYMOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::POW: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::EQUAL: +case binary_operator::NOT_EQUAL: +case binary_operator::NULL_EQUALS: +if(out.type().id() != type_id::BOOL8) CUDF_FAIL("Output type of Comparison operator should be bool type"); +dispatch_equality_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, op, stream); break; +case binary_operator::LESS: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::GREATER: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LESS_EQUAL: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case 
binary_operator::GREATER_EQUAL: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::BITWISE_AND: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::BITWISE_OR: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::BITWISE_XOR: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LOGICAL_AND: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LOGICAL_OR: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +/* +case binary_operator::GENERIC_BINARY: // Cannot be compiled, should be called by jit::binary_operation +*/ +case binary_operator::SHIFT_LEFT: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::SHIFT_RIGHT: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::SHIFT_RIGHT_UNSIGNED: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LOG_BASE: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::ATAN2: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::PMOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_MAX: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_MIN: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +default:; +} + // clang-format on +} + +// vector_vector +void binary_operation(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto lhsd = column_device_view::create(lhs, stream); + auto rhsd = column_device_view::create(rhs, stream); + auto outd = mutable_column_device_view::create(out, stream); + operator_dispatcher(*outd, *lhsd, *rhsd, false, false, op, stream); +} +// scalar_vector +void binary_operation(mutable_column_view& out, + scalar const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto [lhsd, aux] = scalar_to_column_device_view(lhs, stream); + auto rhsd = column_device_view::create(rhs, stream); + auto outd = mutable_column_device_view::create(out, stream); + operator_dispatcher(*outd, *lhsd, *rhsd, true, false, op, stream); +} +// vector_scalar +void binary_operation(mutable_column_view& out, + column_view const& lhs, + scalar const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto lhsd = column_device_view::create(lhs, stream); + auto [rhsd, aux] = scalar_to_column_device_view(rhs, stream); + auto outd = mutable_column_device_view::create(out, stream); + operator_dispatcher(*outd, *lhsd, *rhsd, false, true, op, stream); } } // namespace compiled diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh new file mode 100644 index 00000000000..b17f3eddc5d --- /dev/null +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "binary_ops.hpp" +#include "operation.cuh" + +#include +#include + +#include +#include + +namespace cudf { +namespace binops { +namespace compiled { + +template +constexpr bool is_bool_result() +{ + using ReturnType = std::invoke_result_t; + return std::is_same_v; +} + +/** + * @brief Type casts each element of the column to `CastType` + * + */ +template +struct type_casted_accessor { + template + CUDA_DEVICE_CALLABLE CastType operator()(cudf::size_type i, + column_device_view const& col, + bool is_scalar) const + { + if constexpr (column_device_view::has_element_accessor() and + std::is_convertible_v) + return static_cast(col.element(is_scalar ? 0 : i)); + return {}; + } +}; + +/** + * @brief Type casts value to column type and stores in `i`th row of the column + * + */ +template +struct typed_casted_writer { + template + CUDA_DEVICE_CALLABLE void operator()(cudf::size_type i, + mutable_column_device_view const& col, + FromType val) const + { + if constexpr (mutable_column_device_view::has_element_accessor() and + std::is_constructible_v) { + col.element(i) = static_cast(val); + } else if constexpr (is_fixed_point() and std::is_constructible_v) { + if constexpr (is_fixed_point()) + col.data()[i] = val.rescaled(numeric::scale_type{col.type().scale()}).value(); + else + col.data()[i] = Element{val, numeric::scale_type{col.type().scale()}}.value(); + } + } +}; + +// Functors to launch only defined operations. + +/** + * @brief Functor to launch only defined operations with common type. + * + * @tparam BinaryOperator binary operator functor + */ +template +struct ops_wrapper { + mutable_column_device_view& out; + column_device_view const& lhs; + column_device_view const& rhs; + bool const& is_lhs_scalar; + bool const& is_rhs_scalar; + template + __device__ void operator()(size_type i) + { + if constexpr (std::is_invocable_v) { + TypeCommon x = + type_dispatcher(lhs.type(), type_casted_accessor{}, i, lhs, is_lhs_scalar); + TypeCommon y = + type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); + auto result = [&]() { + if constexpr (std::is_same_v or + std::is_same_v or + std::is_same_v) { + bool output_valid = false; + auto result = BinaryOperator{}.template operator()( + x, + y, + lhs.is_valid(is_lhs_scalar ? 0 : i), + rhs.is_valid(is_rhs_scalar ? 0 : i), + output_valid); + if (out.nullable() && !output_valid) out.set_null(i); + return result; + } else { + return BinaryOperator{}.template operator()(x, y); + } + // To supress nvcc warning + return std::invoke_result_t{}; + }(); + if constexpr (is_bool_result()) + out.element(i) = result; + else + type_dispatcher(out.type(), typed_casted_writer{}, i, out, result); + } + (void)i; + } +}; + +/** + * @brief Functor to launch only defined operations without common type. 
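+ * When the left- and right-hand types have no common type, the operator is
+ * invoked directly on the two element types (TypeLhs, TypeRhs) via double
+ * type dispatch, instead of first casting both sides to a shared type.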
+ *
+ * @tparam BinaryOperator binary operator functor
+ */
+template
+struct ops2_wrapper {
+  mutable_column_device_view& out;
+  column_device_view const& lhs;
+  column_device_view const& rhs;
+  bool const& is_lhs_scalar;
+  bool const& is_rhs_scalar;
+  template
+  __device__ void operator()(size_type i)
+  {
+    if constexpr (!has_common_type_v and
+                  std::is_invocable_v) {
+      TypeLhs x = lhs.element(is_lhs_scalar ? 0 : i);
+      TypeRhs y = rhs.element(is_rhs_scalar ? 0 : i);
+      auto result = [&]() {
+        if constexpr (std::is_same_v or
+                      std::is_same_v or
+                      std::is_same_v) {
+          bool output_valid = false;
+          auto result = BinaryOperator{}.template operator()(
+            x,
+            y,
+            lhs.is_valid(is_lhs_scalar ? 0 : i),
+            rhs.is_valid(is_rhs_scalar ? 0 : i),
+            output_valid);
+          if (out.nullable() && !output_valid) out.set_null(i);
+          return result;
+        } else {
+          return BinaryOperator{}.template operator()(x, y);
+        }
+        // To suppress nvcc warning
+        return std::invoke_result_t{};
+      }();
+      if constexpr (is_bool_result())
+        out.element(i) = result;
+      else
+        type_dispatcher(out.type(), typed_casted_writer{}, i, out, result);
+    }
+    (void)i;
+  }
+};
+
+/**
+ * @brief Functor that performs single- or double-type dispatch in device code.
+ *
+ * Single-type dispatch is used when lhs and rhs share a common type;
+ * double-type dispatch is used when they do not.
+ *
+ * @tparam BinaryOperator binary operator functor
+ */
+template
+struct device_type_dispatcher {
+  mutable_column_device_view out;
+  column_device_view lhs;
+  column_device_view rhs;
+  bool is_lhs_scalar;
+  bool is_rhs_scalar;
+  std::optional common_data_type;
+
+  __device__ void operator()(size_type i)
+  {
+    if (common_data_type) {
+      type_dispatcher(*common_data_type,
+                      ops_wrapper{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
+                      i);
+    } else {
+      double_type_dispatcher(
+        lhs.type(),
+        rhs.type(),
+        ops2_wrapper{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
+        i);
+    }
+  }
+};
+
+/**
+ * @brief Simplified for_each kernel
+ *
+ * @param size number of elements to process.
+ * @param f Functor object to call for each element.
+ */
+template
+__global__ void for_each_kernel(cudf::size_type size, Functor f)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+#pragma unroll
+  for (cudf::size_type i = start; i < size; i += step) {
+    f(i);
+  }
+}
+
+/**
+ * @brief Launches the simplified for_each kernel with maximum-occupancy grid dimensions.
+ *
+ * @tparam Functor
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param size number of elements to process.
+ * @param f Functor object to call for each element.
+ */
+template
+void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
+{
+  int block_size;
+  int min_grid_size;
+  CUDA_TRY(
+    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel));
+  // 2 elements per thread.
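+  // Illustrative arithmetic (hypothetical numbers, not from the source): if
+  // cudaOccupancyMaxPotentialBlockSize suggests block_size = 256 for this kernel,
+  // then for size = 10'000 elements the launch below computes
+  // div_rounding_up_safe(10'000, 2 * 256) = 20 blocks, and the grid-stride loop
+  // in for_each_kernel leaves each thread roughly two elements.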
+ const int grid_size = util::div_rounding_up_safe(size, 2 * block_size); + for_each_kernel<<>>(size, std::forward(f)); +} + +template +void apply_binary_op(mutable_column_device_view& outd, + column_device_view const& lhsd, + column_device_view const& rhsd, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view stream) +{ + auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type()); + + // Create binop functor instance + auto binop_func = device_type_dispatcher{ + outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}; + // Execute it on every element + for_each(stream, outd.size(), binop_func); +} + +} // namespace compiled +} // namespace binops +} // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index a3f62f5018e..2a814c16d57 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,13 @@ #include +#include + namespace cudf { +// Forward declarations +class column_device_view; +class mutable_column_device_view; + namespace binops { namespace detail { /** @@ -45,6 +51,30 @@ inline bool is_null_dependent(binary_operator op) namespace compiled { +std::unique_ptr string_null_min_max( + scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr string_null_min_max( + column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr string_null_min_max( + column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a binary operation between a string scalar and a string * column. @@ -123,6 +153,89 @@ std::unique_ptr binary_operation( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +void binary_operation(mutable_column_view& out, + scalar const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream); +void binary_operation(mutable_column_view& out, + column_view const& lhs, + scalar const& rhs, + binary_operator op, + rmm::cuda_stream_view stream); +void binary_operation(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream); + +// Defined in util.cpp +/** + * @brief Get the common type among all input types. + * + * @param out type 1 + * @param lhs type 2 + * @param rhs type 3 + * @return common type among @p out, @p lhs, @p rhs. + */ +std::optional get_common_type(data_type out, data_type lhs, data_type rhs); +/** + * @brief Check if input binary operation is supported for the given input and output types. 
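+ *
+ * A sketch of the intent (mirroring apply_binary_op's dispatch): an operation is
+ * reported as supported only when the operator is invocable for the operand types,
+ * via their common type when one exists, and its result is constructible as the
+ * output type.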
+ * + * @param out output type of the binary operation + * @param lhs first operand type of the binary operation + * @param rhs second operand type of the binary operation + * @param op binary operator enum. + * @return true if given binary operator supports given input and output types. + */ +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op); + +// Defined in individual .cu files. +/** + * @brief Deploys single type or double type dispatcher that runs binary operation on each element + * of @p lhsd and @p rhsd columns. + * + * This template is instantiated for each binary operator. + * + * @tparam BinaryOperator Binary operator functor + * @param outd mutable device view of output column + * @param lhsd device view of left operand column + * @param rhsd device view of right operand column + * @param is_lhs_scalar true if @p lhsd is a single element column representing a scalar + * @param is_rhs_scalar true if @p rhsd is a single element column representing a scalar + * @param stream CUDA stream used for device memory operations + */ +template +void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view stream); +/** + * @brief Deploys single type or double type dispatcher that runs equality operation on each element + * of @p lhsd and @p rhsd columns. + * + * Comparison operators are EQUAL, NOT_EQUAL, NULL_EQUALS. + * @p outd type is boolean. + * + * This template is instantiated for each binary operator. + * + * @param outd mutable device view of output column + * @param lhsd device view of left operand column + * @param rhsd device view of right operand column + * @param is_lhs_scalar true if @p lhsd is a single element column representing a scalar + * @param is_rhs_scalar true if @p rhsd is a single element column representing a scalar + * @param op comparison binary operator + * @param stream CUDA stream used for device memory operations + */ +void dispatch_equality_op(mutable_column_device_view& outd, + column_device_view const& lhsd, + column_device_view const& rhsd, + bool is_lhs_scalar, + bool is_rhs_scalar, + binary_operator op, + rmm::cuda_stream_view stream); } // namespace compiled } // namespace binops } // namespace cudf diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu new file mode 100644 index 00000000000..feee310716a --- /dev/null +++ b/cpp/src/binaryop/compiled/equality_ops.cu @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +void dispatch_equality_op(mutable_column_device_view& outd, + column_device_view const& lhsd, + column_device_view const& rhsd, + bool is_lhs_scalar, + bool is_rhs_scalar, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type()); + + // Execute it on every element + for_each( + stream, + outd.size(), + [op, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype] __device__(size_type i) { + // clang-format off + // Similar enabled template types should go together (better performance) + switch (op) { + case binary_operator::EQUAL: device_type_dispatcher{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break; + case binary_operator::NOT_EQUAL: device_type_dispatcher{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break; + case binary_operator::NULL_EQUALS: device_type_dispatcher{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break; + default:; + } + // clang-format on + }); +} +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh new file mode 100644 index 00000000000..86645e2cb8a --- /dev/null +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace binops { +namespace compiled { + +// All binary operations +namespace ops { + +struct Add { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs + rhs) + { + return lhs + rhs; + } +}; + +struct Sub { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs - rhs) + { + return lhs - rhs; + } +}; + +struct Mul { + template + static constexpr inline bool is_supported() + { + return has_common_type_v or + // FIXME: without the following line, compilation error + // _deps/libcudacxx-src/include/cuda/std/detail/libcxx/include/chrono(917): error: + // identifier "cuda::std::__3::ratio<(long)86400000000l, (long)1l> ::num" is undefined in + // device code + (is_duration() and std::is_integral()) or + (std::is_integral() and is_duration()) or + (is_fixed_point() and is_numeric()) or + (is_numeric() and is_fixed_point()); + } + template ()>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs * rhs) + { + return lhs * rhs; + } +}; + +struct Div { + template + static constexpr inline bool is_supported() + { + return has_common_type_v or + // FIXME: without this, compilation error on chrono:917 + (is_duration() and (std::is_integral() or is_duration())) or + (is_fixed_point() and is_numeric()) or + (is_numeric() and is_fixed_point()); + } + template ()>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs / rhs) + { + return lhs / rhs; + } +}; + +struct TrueDiv { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) + -> decltype((static_cast(lhs) / static_cast(rhs))) + { + return (static_cast(lhs) / static_cast(rhs)); + } +}; + +struct FloorDiv { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) + -> decltype(floor(static_cast(lhs) / static_cast(rhs))) + { + return floor(static_cast(lhs) / static_cast(rhs)); + } +}; + +struct Mod { + template + static constexpr inline bool is_supported() + { + return has_common_type_v or + // FIXME: without this, compilation error + //_deps/libcudacxx-src/include/cuda/std/detail/libcxx/include/chrono(1337): + // error : expression must have integral or unscoped enum type + (is_duration() and (std::is_integral() or is_duration())); + } + template ()>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs % rhs) + { + return lhs % rhs; + } + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> float + { + return fmodf(static_cast(lhs), static_cast(rhs)); + } + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> double + { + return fmod(static_cast(lhs), static_cast(rhs)); + } +}; + +struct PMod { + // Ideally, these two specializations - one for integral types and one for non integral + // types shouldn't be required, as std::fmod should promote integral types automatically + // to double and call the std::fmod overload for doubles. 
Sadly, doing this in jitified + // code does not work - it is having trouble deciding between float/double overloads + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + { + using common_t = std::common_type_t; + common_t xconv = static_cast(x); + common_t yconv = static_cast(y); + auto rem = xconv % yconv; + if constexpr (std::is_signed_v) + if (rem < 0) rem = (rem + yconv) % yconv; + return rem; + } + + template < + typename TypeLhs, + typename TypeRhs, + std::enable_if_t<(std::is_floating_point_v>)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + { + using common_t = std::common_type_t; + common_t xconv = static_cast(x); + common_t yconv = static_cast(y); + auto rem = std::fmod(xconv, yconv); + if (rem < 0) rem = std::fmod(rem + yconv, yconv); + return rem; + } +}; + +struct PyMod { + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(((x % y) + y) % y) + { + return ((x % y) + y) % y; + } + + template < + typename TypeLhs, + typename TypeRhs, + std::enable_if_t<(std::is_floating_point_v>)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + double x1 = static_cast(x); + double y1 = static_cast(y); + return fmod(fmod(x1, y1) + y1, y1); + } + + template ())>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(((x % y) + y) % y) + { + return ((x % y) + y) % y; + } +}; + +struct Pow { + template and + std::is_convertible_v)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + return pow(static_cast(x), static_cast(y)); + } +}; + +struct LogBase { + template and + std::is_convertible_v)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + return (std::log(static_cast(x)) / std::log(static_cast(y))); + } +}; + +struct ATan2 { + template and + std::is_convertible_v)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + return std::atan2(static_cast(x), static_cast(y)); + } +}; + +struct ShiftLeft { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x << y) + { + return (x << y); + } +}; + +struct ShiftRight { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x >> y) + { + return (x >> y); + } +}; + +struct ShiftRightUnsigned { + template < + typename TypeLhs, + typename TypeRhs, + std::enable_if_t<(std::is_integral_v and not is_boolean())>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + -> decltype(static_cast>(x) >> y) + { + return (static_cast>(x) >> y); + } +}; + +struct BitwiseAnd { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x & y) + { + return (x & y); + } +}; + +struct BitwiseOr { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x | y) + { + return (x | y); + } +}; + +struct BitwiseXor { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x ^ y) + { + return (x ^ y); + } +}; + +struct LogicalAnd { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x && y) + { + return (x && y); + } +}; + +struct LogicalOr { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x || y) + { + return (x || y); + } +}; + +struct Equal { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x == y) + { + return (x == y); + } +}; + 
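+// A sketch of how these comparison functors are exercised (illustrative, not part
+// of the operator set): the std::is_invocable_v checks in ops_wrapper and
+// is_binary_operation_supported probe expressions such as
+//   ops::Equal{}(int32_t{1}, double{1.0})  // well-formed mixed comparison -> true
+//   ops::Less{}(cudf::string_view{}, 1)    // ill-formed -> operation rejected
+// so a type pair is accepted only when the underlying C++ expression compiles.
+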
+struct NotEqual { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x != y) + { + return (x != y); + } +}; + +struct Less { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x < y) + { + return (x < y); + } +}; + +struct Greater { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x > y) + { + return (x > y); + } +}; + +struct LessEqual { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x <= y) + { + return (x <= y); + } +}; + +struct GreaterEqual { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x >= y) + { + return (x >= y); + } +}; + +struct NullEquals { + template + CUDA_DEVICE_CALLABLE auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x == y) + { + output_valid = true; + if (!lhs_valid && !rhs_valid) return true; + if (lhs_valid && rhs_valid) return x == y; + return false; + } + // To allow std::is_invocable_v = true + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x == y); +}; + +struct NullMax { + template > + CUDA_DEVICE_CALLABLE auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) + -> decltype(static_cast(static_cast(x) > static_cast(y) ? x : y)) + { + output_valid = true; + auto const x_conv = static_cast(x); + auto const y_conv = static_cast(y); + if (!lhs_valid && !rhs_valid) { + output_valid = false; + return common_t{}; + } else if (lhs_valid && rhs_valid) { + return (x_conv > y_conv) ? x_conv : y_conv; + } else if (lhs_valid) + return x_conv; + else + return y_conv; + } + // To allow std::is_invocable_v = true + template > + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + -> decltype(static_cast(static_cast(x) > static_cast(y) ? x : y)); +}; + +struct NullMin { + template > + CUDA_DEVICE_CALLABLE auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) + -> decltype(static_cast(static_cast(x) < static_cast(y) ? x : y)) + { + output_valid = true; + auto const x_conv = static_cast(x); + auto const y_conv = static_cast(y); + if (!lhs_valid && !rhs_valid) { + output_valid = false; + return common_t{}; + } else if (lhs_valid && rhs_valid) { + return (x_conv < y_conv) ? x_conv : y_conv; + } else if (lhs_valid) + return x_conv; + else + return y_conv; + } + // To allow std::is_invocable_v = true + template > + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + -> decltype(static_cast(static_cast(x) < static_cast(y) ? x : y)); +}; + +} // namespace ops +} // namespace compiled +} // namespace binops +} // namespace cudf diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp new file mode 100644 index 00000000000..89320256aec --- /dev/null +++ b/cpp/src/binaryop/compiled/util.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "operation.cuh"
+
+#include
+#include
+#include
+#include
+
+namespace cudf::binops::compiled {
+
+namespace {
+/**
+ * @brief Functor that returns the optional common type of 2 or 3 given types.
+ *
+ */
+struct common_type_functor {
+  template
+  struct nested_common_type_functor {
+    template
+    std::optional operator()()
+    {
+      // If common_type exists
+      if constexpr (cudf::has_common_type_v) {
+        using TypeCommon = typename std::common_type::type;
+        return data_type{type_to_id()};
+      } else if constexpr (cudf::has_common_type_v) {
+        using TypeCommon = typename std::common_type::type;
+        // E.g. d = t - t: duration = timestamp - timestamp
+        return data_type{type_to_id()};
+      }
+      return {};
+    }
+  };
+  template
+  std::optional operator()(data_type out)
+  {
+    return type_dispatcher(out, nested_common_type_functor{});
+  }
+};
+
+/**
+ * @brief Functor that returns true if BinaryOperator supports the given input and output types.
+ *
+ * @tparam BinaryOperator binary operator functor
+ */
+template
+struct is_binary_operation_supported {
+  // For operations where the Out type is fixed (e.g. comparison ops)
+  template
+  inline constexpr bool operator()(void)
+  {
+    if constexpr (column_device_view::has_element_accessor() and
+                  column_device_view::has_element_accessor()) {
+      if constexpr (has_common_type_v) {
+        using common_t = std::common_type_t;
+        return std::is_invocable_v;
+      } else
+        return std::is_invocable_v;
+    } else {
+      return false;
+    }
+  }
+
+  template
+  inline constexpr bool operator()(void)
+  {
+    if constexpr (column_device_view::has_element_accessor() and
+                  column_device_view::has_element_accessor() and
+                  (mutable_column_device_view::has_element_accessor() or
+                   is_fixed_point())) {
+      if constexpr (has_common_type_v) {
+        using common_t = std::common_type_t;
+        if constexpr (std::is_invocable_v) {
+          using ReturnType = std::invoke_result_t;
+          return std::is_constructible_v;
+        }
+      } else {
+        if constexpr (std::is_invocable_v) {
+          using ReturnType = std::invoke_result_t;
+          return std::is_constructible_v;
+        }
+      }
+    }
+    return false;
+  }
+};
+
+struct is_supported_operation_functor {
+  template
+  struct nested_support_functor {
+    template
+    inline constexpr bool call()
+    {
+      return is_binary_operation_supported{}
+        .template operator()();
+    }
+    template
+    inline constexpr bool operator()(binary_operator op)
+    {
+      switch (op) {
+        // clang-format off
+        case binary_operator::ADD: return call();
+        case binary_operator::SUB: return call();
+        case binary_operator::MUL: return call();
+        case binary_operator::DIV: return call();
+        case binary_operator::TRUE_DIV: return call();
+        case binary_operator::FLOOR_DIV: return call();
+        case binary_operator::MOD: return call();
+        case binary_operator::PYMOD: return call();
+        case binary_operator::POW: return call();
+        case binary_operator::BITWISE_AND: return call();
+        case binary_operator::BITWISE_OR: return call();
+        case binary_operator::BITWISE_XOR: return call();
+        case binary_operator::SHIFT_LEFT: return call();
+        case binary_operator::SHIFT_RIGHT: return call();
+        case binary_operator::SHIFT_RIGHT_UNSIGNED: return call();
+        case binary_operator::LOG_BASE: return call();
+        case binary_operator::ATAN2: return call();
+        case binary_operator::PMOD: return call();
+        case binary_operator::NULL_MAX: return call();
+        case binary_operator::NULL_MIN: return call();
+        /*
+        case binary_operator::GENERIC_BINARY: // defined in jit only.
+ */ + default: return false; + // clang-format on + } + } + }; + + template + inline constexpr bool bool_op(data_type out) + { + return out.id() == type_id::BOOL8 and + is_binary_operation_supported{}.template operator()(); + } + template + inline constexpr bool operator()(data_type out, binary_operator op) + { + switch (op) { + // output type should be bool type. + case binary_operator::LOGICAL_AND: return bool_op(out); + case binary_operator::LOGICAL_OR: return bool_op(out); + case binary_operator::EQUAL: return bool_op(out); + case binary_operator::NOT_EQUAL: return bool_op(out); + case binary_operator::LESS: return bool_op(out); + case binary_operator::GREATER: return bool_op(out); + case binary_operator::LESS_EQUAL: return bool_op(out); + case binary_operator::GREATER_EQUAL: return bool_op(out); + case binary_operator::NULL_EQUALS: return bool_op(out); + default: return type_dispatcher(out, nested_support_functor{}, op); + } + return false; + } +}; + +} // namespace + +std::optional get_common_type(data_type out, data_type lhs, data_type rhs) +{ + return double_type_dispatcher(lhs, rhs, common_type_functor{}, out); +} + +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op) +{ + return double_type_dispatcher(lhs, rhs, is_supported_operation_functor{}, out, op); +} +} // namespace cudf::binops::compiled diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 28d1411c30d..c3add0ea97e 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -80,7 +80,7 @@ namespace detail { rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { size_type mask_size{0}; @@ -91,14 +91,14 @@ rmm::device_buffer create_null_mask(size_type size, if (state != mask_state::UNINITIALIZED) { uint8_t fill_value = (state == mask_state::ALL_VALID) ? 
0xff : 0x00; CUDA_TRY(cudaMemsetAsync( - static_cast(mask.data()), fill_value, mask_size, stream.value())); + static_cast(mask.data()), fill_value, mask_size, stream.value())); } return mask; } namespace { -__global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, +__global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination, size_type begin_bit, size_type end_bit, bool valid, @@ -130,7 +130,7 @@ __global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type *bitmask, +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid, @@ -145,7 +145,7 @@ void set_null_mask(bitmask_type *bitmask, num_bitmask_words(end_bit) - begin_bit / detail::size_in_bits(); cudf::detail::grid_1d config(number_of_mask_words, 256); set_null_mask_kernel<<>>( - static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); + static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); CHECK_CUDA(stream.value()); } } @@ -155,14 +155,14 @@ void set_null_mask(bitmask_type *bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::create_null_mask(size, state, rmm::cuda_stream_default, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type *bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) { return detail::set_null_mask(bitmask, begin_bit, end_bit, valid); } @@ -181,10 +181,10 @@ namespace { * @param[out] global_count The number of non-zero bits in the specified range */ template -__global__ void count_set_bits_kernel(bitmask_type const *bitmask, +__global__ void count_set_bits_kernel(bitmask_type const* bitmask, size_type first_bit_index, size_type last_bit_index, - size_type *global_count) + size_type* global_count) { constexpr auto const word_size{detail::size_in_bits()}; @@ -215,7 +215,7 @@ __global__ void count_set_bits_kernel(bitmask_type const *bitmask, if (num_slack_bits > 0) { bitmask_type word = bitmask[word_index]; auto slack_mask = (first) ? 
set_least_significant_bits(num_slack_bits) - : set_most_significant_bits(num_slack_bits); + : set_most_significant_bits(num_slack_bits); thread_count -= __popc(word & slack_mask); } @@ -248,7 +248,7 @@ __global__ void count_set_bits_kernel(bitmask_type const *bitmask, * updated */ template -__global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const *bitmask, +__global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const* bitmask, size_type num_ranges, OffsetIterator first_bit_indices, OffsetIterator last_bit_indices, @@ -305,8 +305,8 @@ __global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const *b * @param number_of_mask_words The number of `cudf::bitmask_type` words to copy */ // TODO: Also make binops test that uses offset in column_view -__global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination, - bitmask_type const *__restrict__ source, +__global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, + bitmask_type const* __restrict__ source, size_type source_begin_bit, size_type source_end_bit, size_type number_of_mask_words) @@ -323,7 +323,7 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination, // [first_word_index,last_word_index) struct to_word_index : public thrust::unary_function { const bool _inclusive = false; - size_type const *const _d_bit_indices = nullptr; + size_type const* const _d_bit_indices = nullptr; /** * @brief Constructor of a functor that converts bit indices to bitmask word @@ -333,12 +333,12 @@ struct to_word_index : public thrust::unary_function { * or exclusive. * @param[in] d_bit_indices Pointer to an array of bit indices */ - __host__ to_word_index(bool inclusive, size_type const *d_bit_indices) + __host__ to_word_index(bool inclusive, size_type const* d_bit_indices) : _inclusive(inclusive), _d_bit_indices(d_bit_indices) { } - __device__ size_type operator()(const size_type &i) const + __device__ size_type operator()(const size_type& i) const { auto bit_index = _d_bit_indices[i]; return word_index(bit_index) + ((_inclusive || intra_word_index(bit_index) == 0) ? 
0 : 1); @@ -350,11 +350,11 @@ struct to_word_index : public thrust::unary_function { namespace detail { // Create a bitmask from a specific range -rmm::device_buffer copy_bitmask(bitmask_type const *mask, +rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); @@ -363,26 +363,22 @@ rmm::device_buffer copy_bitmask(bitmask_type const *mask, auto num_bytes = bitmask_allocation_size_bytes(end_bit - begin_bit); if ((mask == nullptr) || (num_bytes == 0)) { return dest_mask; } if (begin_bit == 0) { - dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; + dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; } else { auto number_of_mask_words = num_bitmask_words(end_bit - begin_bit); dest_mask = rmm::device_buffer{num_bytes, stream, mr}; cudf::detail::grid_1d config(number_of_mask_words, 256); copy_offset_bitmask<<>>( - static_cast(dest_mask.data()), - mask, - begin_bit, - end_bit, - number_of_mask_words); + static_cast(dest_mask.data()), mask, begin_bit, end_bit, number_of_mask_words); CHECK_CUDA(stream.value()); } return dest_mask; } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const &view, +rmm::device_buffer copy_bitmask(column_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ -395,11 +391,11 @@ rmm::device_buffer copy_bitmask(column_view const &view, // Inplace Bitwise AND of the masks void inplace_bitmask_and(device_span dest_mask, - host_span masks, + host_span masks, host_span begin_bits, size_type mask_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { inplace_bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -412,11 +408,11 @@ void inplace_bitmask_and(device_span dest_mask, } // Bitwise AND of the masks -rmm::device_buffer bitmask_and(host_span masks, +rmm::device_buffer bitmask_and(host_span masks, host_span begin_bits, size_type mask_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -427,7 +423,7 @@ rmm::device_buffer bitmask_and(host_span masks, mr); } -cudf::size_type count_set_bits(bitmask_type const *bitmask, +cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop, rmm::cuda_stream_view stream = rmm::cuda_stream_default) @@ -455,7 +451,7 @@ cudf::size_type count_set_bits(bitmask_type const *bitmask, return non_zero_count.value(stream); } -cudf::size_type count_unset_bits(bitmask_type const *bitmask, +cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop, rmm::cuda_stream_view stream = rmm::cuda_stream_default) @@ -465,7 +461,7 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask, return (num_bits - detail::count_set_bits(bitmask, start, stop, stream)); } -std::vector segmented_count_set_bits(bitmask_type const *bitmask, +std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream) { @@ -517,7 +513,7 @@ std::vector 
segmented_count_set_bits(bitmask_type const *bitmask, // first_word_indices and last_word_indices to have the same type. to_word_index(false, d_last_indices.data())); - // first allocate temporary memroy + // first allocate temporary memory size_t temp_storage_bytes{0}; CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, @@ -570,7 +566,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, return ret; } -std::vector segmented_count_unset_bits(bitmask_type const *bitmask, +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream) { @@ -591,17 +587,17 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, } // Returns the bitwise AND of the null masks of all columns in the table view -rmm::device_buffer bitmask_and(table_view const &view, +rmm::device_buffer bitmask_and(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; } - std::vector masks; + std::vector masks; std::vector offsets; - for (auto &&col : view) { + for (auto&& col : view) { if (col.nullable()) { masks.push_back(col.null_mask()); offsets.push_back(col.offset()); @@ -622,17 +618,17 @@ rmm::device_buffer bitmask_and(table_view const &view, } // Returns the bitwise OR of the null masks of all columns in the table view -rmm::device_buffer bitmask_or(table_view const &view, +rmm::device_buffer bitmask_or(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; } - std::vector masks; + std::vector masks; std::vector offsets; - for (auto &&col : view) { + for (auto&& col : view) { if (col.nullable()) { masks.push_back(col.null_mask()); offsets.push_back(col.offset()); @@ -654,21 +650,21 @@ rmm::device_buffer bitmask_or(table_view const &view, } // namespace detail // Count non-zero bits in the specified range -cudf::size_type count_set_bits(bitmask_type const *bitmask, size_type start, size_type stop) +cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop) { CUDF_FUNC_RANGE(); return detail::count_set_bits(bitmask, start, stop); } // Count zero bits in the specified range -cudf::size_type count_unset_bits(bitmask_type const *bitmask, size_type start, size_type stop) +cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop) { CUDF_FUNC_RANGE(); return detail::count_unset_bits(bitmask, start, stop); } // Count non-zero bits in the specified ranges -std::vector segmented_count_set_bits(bitmask_type const *bitmask, +std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices) { CUDF_FUNC_RANGE(); @@ -676,7 +672,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, } // Count zero bits in the specified ranges -std::vector segmented_count_unset_bits(bitmask_type const *bitmask, +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices) { CUDF_FUNC_RANGE(); @@ -684,26 +680,26 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, } // Create a bitmask from a specific range -rmm::device_buffer copy_bitmask(bitmask_type const *mask, +rmm::device_buffer 
copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::copy_bitmask(mask, begin_bit, end_bit, rmm::cuda_stream_default, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const &view, rmm::mr::device_memory_resource *mr) +rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) { return detail::copy_bitmask(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_resource *mr) +rmm::device_buffer bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr) { return detail::bitmask_and(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_or(table_view const &view, rmm::mr::device_memory_resource *mr) +rmm::device_buffer bitmask_or(table_view const& view, rmm::mr::device_memory_resource* mr) { return detail::bitmask_or(view, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 3ee8e0a33a9..2a0496b316b 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -45,9 +45,9 @@ namespace cudf { // Copy ctor w/ optional stream/mr -column::column(column const &other, +column::column(column const& other, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _type{other._type}, _size{other._size}, _data{other._data, stream, mr}, @@ -55,13 +55,13 @@ column::column(column const &other, _null_count{other._null_count} { _children.reserve(other.num_children()); - for (auto const &c : other._children) { + for (auto const& c : other._children) { _children.emplace_back(std::make_unique(*c, stream, mr)); } } // Move constructor -column::column(column &&other) noexcept +column::column(column&& other) noexcept : _type{other._type}, _size{other._size}, _data{std::move(other._data)}, @@ -91,12 +91,14 @@ column_view column::view() const // Create views of children std::vector child_views; child_views.reserve(_children.size()); - for (auto const &c : _children) { child_views.emplace_back(*c); } + for (auto const& c : _children) { + child_views.emplace_back(*c); + } return column_view{type(), size(), _data.data(), - static_cast(_null_mask.data()), + static_cast(_null_mask.data()), null_count(), 0, child_views}; @@ -110,7 +112,9 @@ mutable_column_view column::mutable_view() // create views of children std::vector child_views; child_views.reserve(_children.size()); - for (auto const &c : _children) { child_views.emplace_back(*c); } + for (auto const& c : _children) { + child_views.emplace_back(*c); + } // Store the old null count before resetting it. By accessing the value directly instead of // calling `null_count()`, we can avoid a potential invocation of `count_unset_bits()`. 
This does @@ -126,7 +130,7 @@ mutable_column_view column::mutable_view() return mutable_column_view{type(), size(), _data.data(), - static_cast(_null_mask.data()), + static_cast(_null_mask.data()), current_null_count, 0, child_views}; @@ -138,12 +142,12 @@ size_type column::null_count() const CUDF_FUNC_RANGE(); if (_null_count <= cudf::UNKNOWN_NULL_COUNT) { _null_count = - cudf::count_unset_bits(static_cast(_null_mask.data()), 0, size()); + cudf::count_unset_bits(static_cast(_null_mask.data()), 0, size()); } return _null_count; } -void column::set_null_mask(rmm::device_buffer &&new_null_mask, size_type new_null_count) +void column::set_null_mask(rmm::device_buffer&& new_null_mask, size_type new_null_count) { if (new_null_count > 0) { CUDF_EXPECTS(new_null_mask.size() >= cudf::bitmask_allocation_size_bytes(this->size()), @@ -154,7 +158,7 @@ void column::set_null_mask(rmm::device_buffer &&new_null_mask, size_type new_nul _null_count = new_null_count; } -void column::set_null_mask(rmm::device_buffer const &new_null_mask, +void column::set_null_mask(rmm::device_buffer const& new_null_mask, size_type new_null_count, rmm::cuda_stream_view stream) { @@ -177,10 +181,10 @@ namespace { struct create_column_from_view { cudf::column_view view; rmm::cuda_stream_view stream{}; - rmm::mr::device_memory_resource *mr; + rmm::mr::device_memory_resource* mr; template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { cudf::strings_column_view sview(view); @@ -188,7 +192,7 @@ struct create_column_from_view { } template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { std::vector> children; @@ -211,10 +215,10 @@ struct create_column_from_view { std::move(children)); } - template ()> * = nullptr> + template ()>* = nullptr> std::unique_ptr operator()() { - auto op = [&](auto const &child) { return std::make_unique(child, stream, mr); }; + auto op = [&](auto const& child) { return std::make_unique(child, stream, mr); }; auto begin = thrust::make_transform_iterator(view.child_begin(), op); auto children = std::vector>(begin, begin + view.num_children()); @@ -222,7 +226,7 @@ struct create_column_from_view { view.type(), view.size(), rmm::device_buffer{ - static_cast(view.head()) + (view.offset() * cudf::size_of(view.type())), + static_cast(view.head()) + (view.offset() * cudf::size_of(view.type())), view.size() * cudf::size_of(view.type()), stream, mr}, @@ -232,7 +236,7 @@ struct create_column_from_view { } template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { auto lists_view = lists_column_view(view); @@ -240,7 +244,7 @@ struct create_column_from_view { } template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { if (view.is_empty()) { return cudf::empty_like(view); } @@ -271,7 +275,7 @@ struct create_column_from_view { } // anonymous namespace // Copy from a view -column::column(column_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) +column::column(column_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : // Move is needed here because the dereference operator of unique_ptr returns // an lvalue reference, which would otherwise dispatch to the copy constructor column{std::move(*type_dispatcher(view.type(), create_column_from_view{view, stream, mr}))} diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4b11382a3f2..d4d54a3f94f 100644 --- 
a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -53,7 +53,7 @@ inline __device__ std::size_t _round_up_safe(std::size_t number_to_round, std::s * The definition of "buffer" used throughout this module is a component piece of a * cudf column. So for example, a fixed-width column with validity would have 2 associated * buffers : the data itself and the validity buffer. contiguous_split operates by breaking - * each column up into it's individual components and copying each one as a seperate kernel + * each column up into it's individual components and copying each one as a separate kernel * block. */ struct src_buf_info { @@ -188,7 +188,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, } // if we're performing a value shift (offsets), or a bit shift (validity) the # of bytes and - // alignment must be a multiple of 4. value shifting and bit shifting are mututally exclusive + // alignment must be a multiple of 4. value shifting and bit shifting are mutually exclusive // and will never both be true at the same time. if (value_shift || bit_shift) { std::size_t idx = (num_bytes - remainder) / 4; @@ -249,7 +249,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, * * @param num_src_bufs Total number of source buffers (N) * @param src_bufs Input source buffers (N) - * @param dst_bufs Desination buffers (N*M) + * @param dst_bufs Destination buffers (N*M) * @param buf_info Information on the range of values to be copied for each destination buffer. */ template diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 67d96bbc7ce..9456ae06b21 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -34,17 +34,17 @@ namespace detail { namespace { struct get_element_functor { - template () && !is_fixed_point()> *p = nullptr> + template () && !is_fixed_point()>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = make_fixed_width_scalar(data_type(type_to_id()), stream, mr); using ScalarType = cudf::scalar_type_t; - auto typed_s = static_cast(s.get()); + auto typed_s = static_cast(s.get()); auto device_s = get_scalar_device_view(*typed_s); auto device_col = column_device_view::create(input, stream); @@ -58,12 +58,12 @@ struct get_element_functor { return s; } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto device_col = column_device_view::create(input, stream); @@ -83,12 +83,12 @@ struct get_element_functor { return std::make_unique(temp_data, temp_valid.value(stream), stream, mr); } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto dict_view = dictionary_column_view(input); auto indices_iter = 
detail::indexalator_factory::make_input_iterator(dict_view.indices()); @@ -119,12 +119,12 @@ struct get_element_functor { mr); } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { bool valid = is_element_valid_sync(input, index, stream); auto const child_col_idx = lists_column_view::child_column_index; @@ -144,12 +144,12 @@ struct get_element_functor { } } - template ()> *p = nullptr> + template ()>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { using Type = typename T::rep; @@ -175,12 +175,12 @@ struct get_element_functor { mr); } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { bool valid = is_element_valid_sync(input, index, stream); auto row_contents = @@ -192,10 +192,10 @@ struct get_element_functor { } // namespace -std::unique_ptr get_element(column_view const &input, +std::unique_ptr get_element(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds"); return type_dispatcher(input.type(), get_element_functor{}, input, index, stream, mr); @@ -203,9 +203,9 @@ std::unique_ptr get_element(column_view const &input, } // namespace detail -std::unique_ptr get_element(column_view const &input, +std::unique_ptr get_element(column_view const& input, size_type index, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::get_element(input, index, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 182e3ff0584..89e5972f448 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -145,7 +145,7 @@ packed_columns pack(cudf::table_view const& input, // do a contiguous_split with no splits to get the memory for the table // arranged as we want it auto contig_split_result = cudf::detail::contiguous_split(input, {}, stream, mr); - return std::move(contig_split_result[0].data); + return contig_split_result.empty() ? packed_columns{} : std::move(contig_split_result[0].data); } template @@ -229,7 +229,9 @@ packed_columns::metadata pack_metadata(table_view const& table, size_t buffer_size) { CUDF_FUNC_RANGE(); - return detail::pack_metadata(table.begin(), table.end(), contiguous_buffer, buffer_size); + return table.is_empty() + ? 
packed_columns::metadata{} + : detail::pack_metadata(table.begin(), table.end(), contiguous_buffer, buffer_size); } /** @@ -238,8 +240,10 @@ packed_columns::metadata pack_metadata(table_view const& table, table_view unpack(packed_columns const& input) { CUDF_FUNC_RANGE(); - return detail::unpack(input.metadata_->data(), - reinterpret_cast(input.gpu_data->data())); + return input.metadata_->size() == 0 + ? table_view{} + : detail::unpack(input.metadata_->data(), + reinterpret_cast(input.gpu_data->data())); } /** diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index a932957ada4..3312316f548 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -63,32 +65,32 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination, } template -void scatter_scalar_bitmask(std::vector> const& source, - MapIterator scatter_map, - size_type num_scatter_rows, - std::vector>& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void scatter_scalar_bitmask_inplace(std::reference_wrapper const& source, + MapIterator scatter_map, + size_type num_scatter_rows, + column& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { constexpr size_type block_size = 256; size_type const grid_size = grid_1d(num_scatter_rows, block_size).num_blocks; - for (size_t i = 0; i < target.size(); ++i) { - auto const source_is_valid = source[i].get().is_valid(stream); - if (target[i]->nullable() or not source_is_valid) { - if (not target[i]->nullable()) { - // Target must have a null mask if the source is not valid - auto mask = detail::create_null_mask(target[i]->size(), mask_state::ALL_VALID, stream, mr); - target[i]->set_null_mask(std::move(mask), 0); - } - - auto target_view = mutable_column_device_view::create(target[i]->mutable_view(), stream); - - auto bitmask_kernel = source_is_valid ? marking_bitmask_kernel - : marking_bitmask_kernel; - bitmask_kernel<<>>( - *target_view, scatter_map, num_scatter_rows); + auto const source_is_valid = source.get().is_valid(stream); + if (target.nullable() or not source_is_valid) { + if (not target.nullable()) { + // Target must have a null mask if the source is not valid + auto mask = detail::create_null_mask(target.size(), mask_state::ALL_VALID, stream, mr); + target.set_null_mask(std::move(mask), 0); } + + auto target_view = mutable_column_device_view::create(target, stream); + + auto bitmask_kernel = source_is_valid ? 
marking_bitmask_kernel + : marking_bitmask_kernel; + bitmask_kernel<<>>( + *target_view, scatter_map, num_scatter_rows); + + target.set_null_count(count_unset_bits(target.view().null_mask(), 0, target.size(), stream)); } } @@ -103,6 +105,7 @@ struct column_scalar_scatterer_impl { { CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); + // make a copy of data and null mask from source auto result = std::make_unique(target, stream, mr); auto result_view = result->mutable_view(); @@ -117,6 +120,7 @@ struct column_scalar_scatterer_impl { scatter_iter, result_view.begin()); + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); return result; } }; @@ -136,7 +140,10 @@ struct column_scalar_scatterer_impl { auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); auto const begin = thrust::make_constant_iterator(source_view); auto const end = begin + scatter_rows; - return strings::detail::scatter(begin, end, scatter_iter, target, stream, mr); + auto result = strings::detail::scatter(begin, end, scatter_iter, target, stream, mr); + + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); + return result; } }; @@ -149,17 +156,11 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return lists::detail::scatter( - source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); - } -}; + auto result = + lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); -template -struct column_scalar_scatterer_impl { - template - std::unique_ptr operator()(Args&&...) const - { - CUDF_FAIL("scatter scalar to struct_view not implemented"); + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); + return result; } }; @@ -200,10 +201,13 @@ struct column_scalar_scatterer_impl { // use the keys from the matched column std::unique_ptr keys_column(std::move(dict_target->release().children.back())); // create the output column - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - std::move(*(contents.null_mask.release())), - null_count); + auto result = make_dictionary_column(std::move(keys_column), + std::move(indices_column), + std::move(*(contents.null_mask.release())), + null_count); + + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); + return result; } }; @@ -222,6 +226,55 @@ struct column_scalar_scatterer { } }; +template +struct column_scalar_scatterer_impl { + std::unique_ptr operator()(std::reference_wrapper const& source, + MapIterator scatter_iter, + size_type scatter_rows, + column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + // For each field of `source`, copy construct a scalar from the field + // and dispatch to the corresponding scalar scatterer + + auto typed_s = static_cast(&source.get()); + size_type const n_fields = typed_s->view().num_columns(); + CUDF_EXPECTS(n_fields == target.num_children(), "Mismatched number of fields."); + + auto scatter_functor = column_scalar_scatterer{}; + auto fields_iter_begin = make_counting_transform_iterator(0, [&](auto const& i) { + auto row_slr = get_element(typed_s->view().column(i), 0, stream); + return type_dispatcher(row_slr->type(), + scatter_functor, + *row_slr, + scatter_iter, + scatter_rows, + target.child(i), + stream, + mr); + }); + std::vector> 
fields(fields_iter_begin, fields_iter_begin + n_fields); + + // Compute null mask + rmm::device_buffer null_mask = + target.nullable() ? copy_bitmask(target, stream, mr) + : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + column null_mask_stub(data_type{type_id::STRUCT}, + target.size(), + rmm::device_buffer{}, + std::move(null_mask), + target.null_count()); + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, null_mask_stub, stream, mr); + size_type null_count = null_mask_stub.null_count(); + auto contents = null_mask_stub.release(); + + // Null mask pushdown inside factory method + return make_structs_column( + target.size(), std::move(fields), null_count, std::move(*contents.null_mask)); + } +}; + } // namespace std::unique_ptr
scatter(table_view const& source, @@ -305,8 +358,6 @@ std::unique_ptr<table>
scatter(std::vector<std::reference_wrapper<const scalar>> mr); }); - scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, stream, mr); - return std::make_unique<table>
(std::move(result)); } diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index ebeaf0e3b20..0b88545ffa5 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,13 +45,55 @@ inline bool __device__ out_of_bounds(size_type size, size_type idx) return idx < 0 || idx >= size; } +std::pair create_null_mask(column_device_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const size = input.size(); + auto func_validity = + [size, offset, fill = fill_value.validity_data(), input] __device__(size_type idx) { + auto src_idx = idx - offset; + return out_of_bounds(size, src_idx) ? *fill : input.is_valid(src_idx); + }; + return detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(size), + func_validity, + stream, + mr); +} + struct shift_functor { template - std::enable_if_t(), std::unique_ptr> operator()(Args&&...) + std::enable_if_t() and not std::is_same_v, + std::unique_ptr> + operator()(Args&&...) { CUDF_FAIL("shift does not support non-fixed-width types."); } + template + std::enable_if_t, std::unique_ptr> operator()( + column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto output = cudf::strings::detail::shift( + cudf::strings_column_view(input), offset, fill_value, stream, mr); + + if (input.nullable() || not fill_value.is_valid(stream)) { + auto const d_input = column_device_view::create(input, stream); + auto mask_pair = create_null_mask(*d_input, offset, fill_value, stream, mr); + output->set_null_mask(std::move(std::get<0>(mask_pair))); + output->set_null_count(std::get<1>(mask_pair)); + } + + return output; + } + template std::enable_if_t(), std::unique_ptr> operator()( column_view const& input, @@ -67,29 +110,21 @@ struct shift_functor { detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); auto device_output = mutable_column_device_view::create(*output); - auto size = input.size(); - auto index_begin = thrust::make_counting_iterator(0); - auto index_end = thrust::make_counting_iterator(size); - - if (input.nullable() || not scalar.is_valid()) { - auto func_validity = [size, - offset, - fill = scalar.validity_data(), - input = *device_input] __device__(size_type idx) { - auto src_idx = idx - offset; - return out_of_bounds(size, src_idx) ? *fill : input.is_valid(src_idx); - }; - - auto mask_pair = detail::valid_if(index_begin, index_end, func_validity, stream, mr); + auto const scalar_is_valid = scalar.is_valid(stream); + if (input.nullable() || not scalar_is_valid) { + auto mask_pair = create_null_mask(*device_input, offset, fill_value, stream, mr); output->set_null_mask(std::move(std::get<0>(mask_pair))); output->set_null_count(std::get<1>(mask_pair)); } - auto data = device_output->data(); + auto const size = input.size(); + auto index_begin = thrust::make_counting_iterator(0); + auto index_end = thrust::make_counting_iterator(size); + auto data = device_output->data(); // avoid assigning elements we know to be invalid. 
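For concreteness, the semantics that the validity functor and element copy above implement can be sketched host-side as follows. This is a minimal sketch with a hypothetical helper `shift_sketch`, not a cuDF API; the real code operates on device memory and bitmasks:

```cpp
// Minimal host-side sketch of the shift semantics above: output element idx
// takes input[idx - offset]; positions whose source index falls out of bounds
// take the fill value (null when the fill scalar is invalid).
#include <optional>
#include <vector>

std::vector<std::optional<int>> shift_sketch(std::vector<std::optional<int>> const& in,
                                             int offset,
                                             std::optional<int> fill)
{
  auto const size = static_cast<int>(in.size());
  std::vector<std::optional<int>> out(in.size());
  for (int idx = 0; idx < size; ++idx) {
    auto const src_idx = idx - offset;
    bool const oob     = src_idx < 0 || src_idx >= size;  // mirrors out_of_bounds()
    out[idx]           = oob ? fill : in[src_idx];        // mirrors func_validity + copy
  }
  return out;
}
// shift_sketch({1, 2, 3, 4, 5}, 2, 0) -> {0, 0, 1, 2, 3}
```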
- if (not scalar.is_valid()) { + if (not scalar_is_valid) { if (offset > 0) { index_begin = thrust::make_counting_iterator(offset); data = data + offset; diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 36c3605951e..41f3e7dcfee 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -19,9 +19,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -127,6 +127,17 @@ struct extract_day_num_of_year { } }; +struct is_leap_year_op { + template + CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + auto const days_since_epoch = floor(ts); + auto const date = year_month_day(days_since_epoch); + return date.year().is_leap(); + } +}; + // Apply the functor for every element/row in the input column to create the output column template struct launch_functor { @@ -357,6 +368,14 @@ std::unique_ptr day_of_year(column_view const& column, return detail::apply_datetime_op( column, stream, mr); } + +std::unique_ptr is_leap_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + } // namespace detail std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) @@ -426,5 +445,12 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti return detail::add_calendrical_months( timestamp_column, months_column, rmm::cuda_stream_default, mr); } + +std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_leap_year(column, rmm::cuda_stream_default, mr); +} + } // namespace datetime } // namespace cudf diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 1dbb844a606..37118779248 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -123,7 +123,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, } CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type"); - // first add the replacment to the keys so only the indices need to be processed + // first add the replacement to the keys so only the indices need to be processed auto input_matched = dictionary::detail::add_keys( input, make_column_from_scalar(replacement, 1, stream)->view(), stream, mr); auto const input_view = dictionary_column_view(input_matched->view()); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 51ca6f5d962..2baf336bb9e 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -205,7 +205,8 @@ std::pair>, std::vector> match_d auto dict_cols = dictionary::detail::match_dictionaries(dict_views, stream, mr); // replace the updated_columns vector entries for the set of columns at col_idx auto dict_col_idx = 0; - for (auto& v : updated_columns) v[col_idx] = dict_cols[dict_col_idx++]->view(); + for (auto& v : updated_columns) + v[col_idx] = dict_cols[dict_col_idx++]->view(); // move the updated dictionary columns into the main output vector std::move(dict_cols.begin(), dict_cols.end(), std::back_inserter(dictionary_columns)); } diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ff62a260d5c..87f83c6edd6 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -413,7 +413,9 @@ void sparse_to_dense_results(table_view const& keys, row_bitmask_ptr, 
stream, mr); - for (auto&& agg : agg_v) { agg->finalize(finalizer); } + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } } } diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 5e202b9ef3f..4e60d8d3f7d 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -238,6 +238,21 @@ void aggregate_result_functor::operator()(aggregation const& cache.add_result(col_idx, agg, std::move(result)); }; +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + auto const mean_agg = make_mean_aggregation(); + operator()(*mean_agg); + auto const mean_result = cache.get_result(col_idx, *mean_agg); + + cache.add_result( + col_idx, + agg, + detail::group_m2(get_grouped_values(), mean_result, helper.group_labels(stream), stream, mr)); +}; + template <> void aggregate_result_functor::operator()(aggregation const& agg) { @@ -474,6 +489,35 @@ void aggregate_result_functor::operator()(aggregation c mr)); }; +/** + * @brief Perform merging for the M2 values that correspond to the same key value. + * + * The partial results input to this aggregation is a structs column with children are columns + * generated by three other groupby aggregations: `COUNT_VALID`, `MEAN`, and `M2` that were + * performed on partitioned datasets. After distributedly computed, the results output from these + * aggregations are (vertically) concatenated before assembling into a structs column given as the + * values column for this aggregation. + * + * For recursive merging of `M2` values, the aggregations values of all input (`COUNT_VALID`, + * `MEAN`, and `M2`) are all merged and stored in the output of this aggregation. As such, the + * output will be a structs column containing children columns of merged `COUNT_VALID`, `MEAN`, and + * `M2` values. + * + * The values of M2 are merged following the parallel algorithm described here: + * https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Parallel_algorithm + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + cache.add_result( + col_idx, + agg, + detail::group_merge_m2( + get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); +}; + } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index bed64c5147a..6ce23ffc35b 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include -#include +#include namespace cudf { namespace groupby { @@ -39,29 +39,24 @@ std::unique_ptr group_argmax(column_view const& values, num_groups, group_labels, stream, - rmm::mr::get_current_device_resource()); + mr); // The functor returns the index of maximum in the sorted values. // We need the index of maximum in the original unsorted values. // So use indices to gather the sort order used to sort `values`. // Gather map cannot be null so we make a view with the mask removed. 
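The sorted-to-original index translation described in the comments above, sketched as standalone host code with hypothetical data:

```cpp
// Standalone sketch of the translation from sorted positions back to original
// row indices; keys, values, and orderings here are made up for illustration.
#include <cassert>
#include <vector>

int main()
{
  // keys = [B, A, B, A], values = [5, 9, 3, 7]
  std::vector<int> key_sort_order{1, 3, 0, 2};  // stable row order after sorting by key
  // Values in sorted-by-key order are [9, 7, 5, 3]: group A = {9, 7}, group B = {5, 3}.
  // The argmax functor works on the sorted values, so it yields *sorted* positions:
  std::vector<int> sorted_argmax{0, 2};
  // Gathering through key_sort_order recovers positions in the original values:
  std::vector<int> argmax(sorted_argmax.size());
  for (std::size_t g = 0; g < sorted_argmax.size(); ++g)
    argmax[g] = key_sort_order[sorted_argmax[g]];
  assert(argmax[0] == 1 && argmax[1] == 0);  // rows holding 9 (group A) and 5 (group B)
  return 0;
}
```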
// The values in data buffer of indices corresponding to null values was - // initialized to ARGMAX_SENTINEL which is an out of bounds index value (-1) - // and causes the gathered value to be null. - column_view null_removed_indices( - data_type(type_to_id()), - indices->size(), - static_cast(indices->view().template data())); - auto result_table = - cudf::detail::gather(table_view({key_sort_order}), - null_removed_indices, - indices->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - return std::move(result_table->release()[0]); + // initialized to ARGMAX_SENTINEL. Using gather_if. + // This can't use gather because nulls in gathered column will not store ARGMAX_SENTINEL. + auto indices_view = indices->mutable_view(); + thrust::gather_if(rmm::exec_policy(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.begin(), // input + indices_view.begin(), // result + [] __device__(auto i) { return (i != cudf::detail::ARGMAX_SENTINEL); }); + return indices; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index ec97a609390..ab91c2c0d29 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include -#include +#include namespace cudf { namespace groupby { @@ -39,29 +39,24 @@ std::unique_ptr group_argmin(column_view const& values, num_groups, group_labels, stream, - rmm::mr::get_current_device_resource()); + mr); // The functor returns the index of minimum in the sorted values. // We need the index of minimum in the original unsorted values. // So use indices to gather the sort order used to sort `values`. - // Gather map cannot be null so we make a view with the mask removed. // The values in data buffer of indices corresponding to null values was - // initialized to ARGMIN_SENTINEL which is an out of bounds index value (-1) - // and causes the gathered value to be null. - column_view null_removed_indices( - data_type(type_to_id()), - indices->size(), - static_cast(indices->view().template data())); - auto result_table = - cudf::detail::gather(table_view({key_sort_order}), - null_removed_indices, - indices->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); + // initialized to ARGMIN_SENTINEL. Using gather_if. + // This can't use gather because nulls in gathered column will not store ARGMIN_SENTINEL. 
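A minimal, self-contained illustration of the `thrust::gather_if` pattern used here (compile with nvcc; `-1` stands in for the ARGMIN/ARGMAX sentinel):

```cpp
#include <thrust/device_vector.h>
#include <thrust/gather.h>

// Entries equal to the sentinel fail the predicate and are left untouched,
// so they keep the sentinel instead of gathering a garbage value.
struct not_sentinel {
  __host__ __device__ bool operator()(int i) const { return i != -1; }
};

int main()
{
  thrust::device_vector<int> indices(3);
  indices[0] = 2; indices[1] = -1; indices[2] = 0;   // map, stencil, and result
  thrust::device_vector<int> sort_order(3);
  sort_order[0] = 10; sort_order[1] = 20; sort_order[2] = 30;

  // Where the stencil passes: indices[i] = sort_order[old indices[i]];
  // where it fails: indices[i] keeps its old value (the sentinel).
  thrust::gather_if(indices.begin(), indices.end(),
                    indices.begin(),     // stencil
                    sort_order.begin(),  // input
                    indices.begin(),     // result (in place, as in the patch)
                    not_sentinel{});
  // indices is now {30, -1, 10}
  return 0;
}
```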
+ auto indices_view = indices->mutable_view(); + thrust::gather_if(rmm::exec_policy(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.begin(), // input + indices_view.begin(), // result + [] __device__(auto i) { return (i != cudf::detail::ARGMIN_SENTINEL); }); - return std::move(result_table->release()[0]); + return indices; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index 1e6a681af94..a30d4639af8 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -41,11 +41,11 @@ namespace detail { * @return Pair of null-eliminated grouped values and corresponding offsets */ std::pair, std::unique_ptr> purge_null_entries( - column_view const &values, - column_view const &offsets, + column_view const& values, + column_view const& offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { auto values_device_view = column_device_view::create(values, stream); @@ -81,12 +81,12 @@ std::pair, std::unique_ptr> purge_null_entries( std::move(null_purged_values), std::move(null_purged_offsets)); } -std::unique_ptr group_collect(column_view const &values, +std::unique_ptr group_collect(column_view const& values, cudf::device_span group_offsets, size_type num_groups, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { auto [child_column, offsets_column] = [null_handling, num_groups, &values, &group_offsets, stream, mr] { diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu new file mode 100644 index 00000000000..a72f6c6f647 --- /dev/null +++ b/cpp/src/groupby/sort/group_m2.cu @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace { + +template +struct m2_transform { + column_device_view const d_values; + Iterator const values_iter; + ResultType const* d_means; + size_type const* d_group_labels; + + __device__ ResultType operator()(size_type const idx) const noexcept + { + if (d_values.is_null(idx)) { return 0.0; } + + auto const x = static_cast(values_iter[idx]); + auto const group_idx = d_group_labels[idx]; + auto const mean = d_means[group_idx]; + auto const diff = x - mean; + return diff * diff; + } +}; + +template +void compute_m2_fn(column_device_view const& values, + Iterator values_iter, + cudf::device_span group_labels, + ResultType const* d_means, + ResultType* d_result, + rmm::cuda_stream_view stream) +{ + auto const var_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, + m2_transform{ + values, values_iter, d_means, group_labels.data()}); + + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + var_iter, + thrust::make_discard_iterator(), + d_result); +} + +struct m2_functor { + template + std::enable_if_t::value, std::unique_ptr> operator()( + column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using result_type = cudf::detail::target_type_t; + auto result = make_numeric_column(data_type(type_to_id()), + group_means.size(), + mask_state::UNALLOCATED, + stream, + mr); + + auto const values_dv_ptr = column_device_view::create(values, stream); + auto const d_values = *values_dv_ptr; + auto const d_means = group_means.data(); + auto const d_result = result->mutable_view().data(); + + if (!cudf::is_dictionary(values.type())) { + auto const values_iter = d_values.begin(); + compute_m2_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + } else { + auto const values_iter = + cudf::dictionary::detail::make_dictionary_iterator(*values_dv_ptr); + compute_m2_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + } + + // M2 column values should have the same bitmask as means's. + if (group_means.nullable()) { + result->set_null_mask(cudf::detail::copy_bitmask(group_means, stream, mr), + group_means.null_count()); + } + + return result; + } + + template + std::enable_if_t::value, std::unique_ptr> operator()(Args&&...) + { + CUDF_FAIL("Only numeric types are supported in M2 groupby aggregation"); + } +}; + +} // namespace + +std::unique_ptr group_m2(column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto values_type = cudf::is_dictionary(values.type()) + ? dictionary_column_view(values).keys().type() + : values.type(); + + return type_dispatcher(values_type, m2_functor{}, values, group_means, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu new file mode 100644 index 00000000000..4e2a5b68abc --- /dev/null +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace { +/** + * @brief Struct to store partial results for merging. + */ +template +struct partial_result { + size_type count; + result_type mean; + result_type M2; +}; + +/** + * @brief Functor to accumulate (merge) all partial results corresponding to the same key into a + * final result storing in a member variable. It performs merging for the partial results of + * `COUNT_VALID`, `MEAN`, and `M2` at the same time. + */ +template +struct accumulate_fn { + partial_result merge_vals; + + void __device__ operator()(partial_result const& partial_vals) noexcept + { + if (partial_vals.count == 0) { return; } + + auto const n_ab = merge_vals.count + partial_vals.count; + auto const delta = partial_vals.mean - merge_vals.mean; + merge_vals.M2 += partial_vals.M2 + (delta * delta) * + static_cast(merge_vals.count) * + static_cast(partial_vals.count) / n_ab; + merge_vals.mean = + (merge_vals.mean * merge_vals.count + partial_vals.mean * partial_vals.count) / n_ab; + merge_vals.count = n_ab; + } +}; + +/** + * @brief Functor to merge partial results of `COUNT_VALID`, `MEAN`, and `M2` aggregations + * for a given group (key) index. + */ +template +struct merge_fn { + size_type const* const d_offsets; + size_type const* const d_counts; + result_type const* const d_means; + result_type const* const d_M2s; + + auto __device__ operator()(size_type const group_idx) noexcept + { + auto const start_idx = d_offsets[group_idx], end_idx = d_offsets[group_idx + 1]; + + // This case should never happen, because all groups are non-empty as the results of + // aggregation. Here we just to make sure we cover this case. + if (start_idx == end_idx) { + return thrust::make_tuple(size_type{0}, result_type{0}, result_type{0}, int8_t{0}); + } + + // If `(n = d_counts[idx]) > 0` then `d_means[idx] != null` and `d_M2s[idx] != null`. + // Otherwise (`n == 0`), these value (mean and M2) will always be nulls. + // In such cases, reading `mean` and `M2` from memory will return garbage values. + // By setting these values to zero when `n == 0`, we can safely merge the all-zero tuple without + // affecting the final result. + auto get_partial_result = [&] __device__(size_type idx) { + { + auto const n = d_counts[idx]; + return n > 0 ? partial_result{n, d_means[idx], d_M2s[idx]} + : partial_result{size_type{0}, result_type{0}, result_type{0}}; + }; + }; + + // Firstly, store tuple(count, mean, M2) of the first partial result in an accumulator. + auto accumulator = accumulate_fn{get_partial_result(start_idx)}; + + // Then, accumulate (merge) the remaining partial results into that accumulator. + for (auto idx = start_idx + 1; idx < end_idx; ++idx) { + accumulator(get_partial_result(idx)); + } + + // Get the final result after merging. 
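As a concrete check of the update implemented by `accumulate_fn`, a host-side sketch with hypothetical numbers; the merged M2 of two partitions equals the M2 of their union:

```cpp
// Host-side sketch of the pairwise merge implemented by accumulate_fn above.
#include <cassert>

struct partial { int count; double mean; double M2; };

partial merge(partial a, partial b)
{
  if (b.count == 0) return a;
  if (a.count == 0) return b;
  auto const n_ab  = a.count + b.count;
  auto const delta = b.mean - a.mean;
  return {n_ab,
          (a.mean * a.count + b.mean * b.count) / n_ab,
          a.M2 + b.M2 + delta * delta * a.count * b.count / n_ab};
}

int main()
{
  // Partition A = {1, 2, 3}: count 3, mean 2.0, M2 2.0
  // Partition B = {4, 5}:    count 2, mean 4.5, M2 0.5
  auto const m = merge({3, 2.0, 2.0}, {2, 4.5, 0.5});
  // Union {1, 2, 3, 4, 5}:   mean 3.0, M2 = 4 + 1 + 0 + 1 + 4 = 10
  assert(m.count == 5 && m.mean == 3.0 && m.M2 == 10.0);
  return 0;
}
```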
+ auto const& merge_vals = accumulator.merge_vals; + + // If there are all nulls in the partial results (i.e., sum of all valid counts is + // zero), then the output is a null. + auto const is_valid = int8_t{merge_vals.count > 0}; + + return thrust::make_tuple(merge_vals.count, merge_vals.mean, merge_vals.M2, is_valid); + } +}; + +} // namespace + +std::unique_ptr group_merge_m2(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `group_merge_m2` must be a structs column."); + CUDF_EXPECTS(values.num_children() == 3, + "Input to `group_merge_m2` must be a structs column having 3 children columns."); + + using result_type = id_to_type; + static_assert( + std::is_same_v, result_type>); + CUDF_EXPECTS(values.child(0).type().id() == type_id::INT32 && + values.child(1).type().id() == type_to_id() && + values.child(2).type().id() == type_to_id(), + "Input to `group_merge_m2` must be a structs column having children columns " + "containing tuples of (M2_value, mean, valid_count)."); + + auto result_counts = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto result_means = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto result_M2s = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto validities = rmm::device_uvector(num_groups, stream); + + // Perform merging for all the aggregations. Their output (and their validity data) are written + // out concurrently through an output zip iterator. + using iterator_tuple = thrust::tuple; + using output_iterator = thrust::zip_iterator; + auto const out_iter = + output_iterator{thrust::make_tuple(result_counts->mutable_view().template data(), + result_means->mutable_view().template data(), + result_M2s->mutable_view().template data(), + validities.begin())}; + + auto const count_valid = values.child(0); + auto const mean_values = values.child(1); + auto const M2_values = values.child(2); + auto const iter = thrust::make_counting_iterator(0); + + auto const fn = merge_fn{group_offsets.begin(), + count_valid.template begin(), + mean_values.template begin(), + M2_values.template begin()}; + thrust::transform(rmm::exec_policy(stream), iter, iter + num_groups, out_iter, fn); + + // Generate bitmask for the output. + // Only mean and M2 values can be nullable. Count column must be non-nullable. + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { + result_means->set_null_mask(null_mask, null_count); // copy null_mask + result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask + } + + // Output is a structs column containing the merged values of `COUNT_VALID`, `MEAN`, and `M2`. 
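The merged `(count, mean, M2)` triple is what a distributed variance or standard deviation ultimately needs; recovering them is a single division. A sketch using the standard definitions (not a cuDF API):

```cpp
// Variance from a merged (count, mean, M2) triple; ddof = 1 gives sample variance.
#include <cmath>
#include <limits>

double variance_from_m2(int count, double M2, int ddof = 1)
{
  return count > ddof ? M2 / (count - ddof)
                      : std::numeric_limits<double>::quiet_NaN();
}
// variance_from_m2(5, 10.0) == 2.5; standard deviation = std::sqrt(2.5)
```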
+ std::vector> out_columns; + out_columns.emplace_back(std::move(result_counts)); + out_columns.emplace_back(std::move(result_means)); + out_columns.emplace_back(std::move(result_M2s)); + auto result = cudf::make_structs_column( + num_groups, std::move(out_columns), 0, rmm::device_buffer{0, stream, mr}, stream, mr); + + return result; +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index c3d874f3b33..e7dc57f6c93 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -33,15 +33,15 @@ namespace cudf { namespace groupby { namespace detail { -std::unique_ptr group_nth_element(column_view const &values, - column_view const &group_sizes, +std::unique_ptr group_nth_element(column_view const& values, + column_view const& group_sizes, cudf::device_span group_labels, cudf::device_span group_offsets, size_type num_groups, size_type n, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), "Size of values column should be same as that of group labels"); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 3390af29330..2770162da2d 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -217,6 +217,30 @@ std::unique_ptr group_count_all(cudf::device_span group rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to calculate sum of squares of differences from means. + * + * If there are only nulls in the group, the output value of that group will be null. + * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_means = [2.333333, -1.5, 4.0, ] + * group_m2(...) = [4.666666, 1.0, 0.0, ] + * @endcode + * + * @param values Grouped values to compute M2 values + * @param group_means Pre-computed groupwise MEAN + * @param group_labels ID of group corresponding value in @p values belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr group_m2(column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Internal API to calculate groupwise variance * @@ -392,6 +416,32 @@ std::unique_ptr group_merge_lists(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to merge grouped M2 values corresponding to the same key. + * + * The values of M2 are merged following the parallel algorithm described here: + * `https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm` + * + * Merging M2 values require accessing to partial M2 values, means, and valid counts. Thus, the + * input to this aggregation need to be a structs column containing tuples of 3 values + * `(valid_count, mean, M2)`. + * + * This aggregation not only merges the partial results of `M2` but also merged all the partial + * results of input aggregations (`COUNT_VALID`, `MEAN`, and `M2`). 
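+ * For illustration (hypothetical numbers; input tuples are `(valid_count, mean, M2)`,
+ * formatted like the `group_m2` example above):
+ * @code{.pseudo}
+ * values         = [{3, 2.0, 2.0}, {2, 4.5, 0.5}, {0, null, null}, {1, 1.0, 0.0}]
+ * group_offsets  = [0, 2, 4]
+ * group_merge_m2(...) = [{5, 3.0, 10.0}, {1, 1.0, 0.0}]
+ * @endcode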
As such, the output will be a + * structs column containing children columns of merged `COUNT_VALID`, `MEAN`, and `M2` values. + * + * @param values Grouped values (tuples of values `(valid_count, mean, M2)`) to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param num_groups Number of groups. + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr group_merge_m2(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** @endinternal * */ diff --git a/cpp/src/hash/concurrent_unordered_multimap.cuh b/cpp/src/hash/concurrent_unordered_multimap.cuh index 071214e80b0..2b92c9142ca 100644 --- a/cpp/src/hash/concurrent_unordered_multimap.cuh +++ b/cpp/src/hash/concurrent_unordered_multimap.cuh @@ -239,7 +239,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @tparam hash_value_type The datatype of the hash value * @@ -284,7 +284,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @param[in] keys_are_equal An optional functor for comparing if two keys are * equal @@ -375,7 +375,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @param[in] keys_are_equal An optional functor for comparing if two keys are * equal @@ -423,7 +423,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @param[in] keys_are_equal An optional functor for comparing if two keys are * equal diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index c8d3178b1d8..c6cc60a6917 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -20,15 +20,15 @@ #include struct managed { - static void *operator new(size_t n) + static void* operator new(size_t n) { - void *ptr = 0; + void* ptr = 0; cudaError_t result = cudaMallocManaged(&ptr, n); if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc(); return ptr; } - static void operator delete(void *ptr) noexcept + static void 
operator delete(void* ptr) noexcept { auto const free_result = cudaFree(ptr); assert(free_result == cudaSuccess); diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 645d9bc5185..d28bf6f6fe5 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -38,8 +38,8 @@ template create(column_view const &col, rmm::cuda_stream_view stream) + static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream) { auto d_column = column_device_view::create(col, stream); auto d_col = *d_column; @@ -86,9 +86,9 @@ class unordered_multiset { auto hash_data = rmm::device_uvector(d_col.size(), stream); Hasher hasher; - size_type *d_hash_bins_start = hash_bins_start.data(); - size_type *d_hash_bins_end = hash_bins_end.data(); - Element *d_hash_data = hash_data.data(); + size_type* d_hash_bins_start = hash_bins_start.data(); + size_type* d_hash_bins_end = hash_bins_end.data(); + Element* d_hash_data = hash_data.data(); thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -134,8 +134,8 @@ class unordered_multiset { private: unordered_multiset(size_type size, - rmm::device_uvector &&hash_bins, - rmm::device_uvector &&hash_data) + rmm::device_uvector&& hash_bins, + rmm::device_uvector&& hash_data) : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)} { } diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 28fc2ae9d4f..917a5b1ac9c 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -94,7 +94,7 @@ namespace { */ struct dispatch_to_cudf_column { /** - * @brief Returns mask from an array withut any offsets. + * @brief Returns mask from an array without any offsets. */ std::unique_ptr get_mask_buffer(arrow::Array const& array, rmm::cuda_stream_view stream, diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index f8fcf03a77e..3cd515e9981 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -96,7 +96,7 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, */ struct dispatch_to_arrow { /** - * @brief Creates vector Arrays from given cudf column childrens + * @brief Creates vector Arrays from given cudf column children */ std::vector> fetch_child_array( column_view input_view, diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 8f0599cdd5b..7227d7e4e0b 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -49,7 +49,7 @@ std::string container::get_encoded() return (len & 1) || (m_cur >= m_end) ? 
0 : std::min(len >> 1, static_cast(m_end - m_cur)); }(); - auto const s = reinterpret_cast(m_cur); + auto const s = reinterpret_cast(m_cur); m_cur += len; return std::string(s, len); } @@ -63,7 +63,7 @@ std::string container::get_encoded() * * @returns true if successful, false if error */ -bool container::parse(file_metadata *md, size_t max_num_rows, size_t first_row) +bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) { constexpr uint32_t avro_magic = (('O' << 0) | ('b' << 8) | ('j' << 16) | (0x01 << 24)); uint32_t sig4, max_block_size; @@ -195,7 +195,7 @@ enum { * * @returns true if successful, false if error */ -bool schema_parser::parse(std::vector &schema, const std::string &json_str) +bool schema_parser::parse(std::vector& schema, const std::string& json_str) { // Empty schema if (json_str == "[]") return true; @@ -361,8 +361,8 @@ bool schema_parser::parse(std::vector &schema, const std::string & std::string schema_parser::get_str() { std::string s; - const char *start = m_cur; - const char *cur = start; + const char* start = m_cur; + const char* cur = start; while (cur < m_end && *cur++ != '"') ; int32_t len = static_cast(cur - start - 1); diff --git a/cpp/src/io/avro/avro.h b/cpp/src/io/avro/avro.h index 13f5e4ecb3c..fe8f5634815 100644 --- a/cpp/src/io/avro/avro.h +++ b/cpp/src/io/avro/avro.h @@ -82,16 +82,16 @@ class schema_parser { public: schema_parser() {} - bool parse(std::vector &schema, const std::string &str); + bool parse(std::vector& schema, const std::string& str); protected: bool more_data() const { return (m_cur < m_end); } std::string get_str(); protected: - const char *m_base; - const char *m_cur; - const char *m_end; + const char* m_base; + const char* m_cur; + const char* m_end; }; /** @@ -99,7 +99,7 @@ class schema_parser { */ class container { public: - container(uint8_t const *base, size_t len) noexcept : m_base{base}, m_cur{base}, m_end{base + len} + container(uint8_t const* base, size_t len) noexcept : m_base{base}, m_cur{base}, m_end{base + len} { } @@ -119,12 +119,12 @@ class container { T get_encoded(); public: - bool parse(file_metadata *md, size_t max_num_rows = 0x7fffffff, size_t first_row = 0); + bool parse(file_metadata* md, size_t max_num_rows = 0x7fffffff, size_t first_row = 0); protected: - const uint8_t *m_base; - const uint8_t *m_cur; - const uint8_t *m_end; + const uint8_t* m_base; + const uint8_t* m_cur; + const uint8_t* m_end; }; } // namespace avro diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index ebd7f51a08a..6fabcf00b8f 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -32,7 +32,7 @@ constexpr int max_shared_schema_len = 1000; * Avro varint encoding - see * https://avro.apache.org/docs/1.2.0/spec.html#binary_encoding */ -static inline int64_t __device__ avro_decode_zigzag_varint(const uint8_t *&cur, const uint8_t *end) +static inline int64_t __device__ avro_decode_zigzag_varint(const uint8_t*& cur, const uint8_t* end) { uint64_t u = 0; if (cur < end) { @@ -65,13 +65,13 @@ static inline int64_t __device__ avro_decode_zigzag_varint(const uint8_t *&cur, * * @return data pointer at the end of the row (start of next row) */ -static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, - schemadesc_s *schema_g, +static const uint8_t* __device__ avro_decode_row(const schemadesc_s* schema, + schemadesc_s* schema_g, uint32_t schema_len, size_t row, size_t max_rows, - const uint8_t *cur, - const uint8_t *end, + const uint8_t* cur, + const uint8_t* end, 
device_span global_dictionary) { uint32_t array_start = 0, array_repeat_count = 0; @@ -96,11 +96,11 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, skip = skip_after; } - void *dataptr = schema[i].dataptr; + void* dataptr = schema[i].dataptr; switch (kind) { case type_null: if (dataptr != nullptr && row < max_rows) { - atomicAnd(static_cast(dataptr) + (row >> 5), ~(1 << (row & 0x1f))); + atomicAnd(static_cast(dataptr) + (row >> 5), ~(1 << (row & 0x1f))); atomicAdd(&schema_g[i].count, 1); } break; @@ -113,13 +113,13 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, int64_t v = avro_decode_zigzag_varint(cur, end); if (kind == type_int) { if (dataptr != nullptr && row < max_rows) { - static_cast(dataptr)[row] = static_cast(v); + static_cast(dataptr)[row] = static_cast(v); } } else if (kind == type_long) { - if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } + if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } } else { // string or enum size_t count = 0; - const char *ptr = 0; + const char* ptr = 0; if (kind == type_enum) { // dictionary size_t idx = schema[i].count + v; if (idx < global_dictionary.size()) { @@ -127,13 +127,13 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, count = global_dictionary[idx].second; } } else if (v >= 0 && cur + v <= end) { // string - ptr = reinterpret_cast(cur); + ptr = reinterpret_cast(cur); count = (size_t)v; cur += count; } if (dataptr != nullptr && row < max_rows) { - static_cast(dataptr)[row].first = ptr; - static_cast(dataptr)[row].second = count; + static_cast(dataptr)[row].first = ptr; + static_cast(dataptr)[row].second = count; } } } break; @@ -147,7 +147,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, } else { v = 0; } - static_cast(dataptr)[row] = v; + static_cast(dataptr)[row] = v; } else { cur += 4; } @@ -162,7 +162,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, } else { v = 0; } - static_cast(dataptr)[row] = v; + static_cast(dataptr)[row] = v; } else { cur += 8; } @@ -170,8 +170,8 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, case type_boolean: if (dataptr != nullptr && row < max_rows) { - uint8_t v = (cur < end) ? *cur : 0; - static_cast(dataptr)[row] = (v) ? 1 : 0; + uint8_t v = (cur < end) ? *cur : 0; + static_cast(dataptr)[row] = (v) ? 
1 : 0; } cur++; break; @@ -228,10 +228,10 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, */ // blockDim {32,num_warps,1} extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) - gpuDecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema_g, + gpuDecodeAvroColumnData(block_desc_s* blocks, + schemadesc_s* schema_g, device_span global_dictionary, - const uint8_t *avro_data, + const uint8_t* avro_data, uint32_t num_blocks, uint32_t schema_len, uint32_t min_row_size, @@ -241,8 +241,8 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) __shared__ __align__(8) schemadesc_s g_shared_schema[max_shared_schema_len]; __shared__ __align__(8) block_desc_s blk_g[num_warps]; - schemadesc_s *schema; - block_desc_s *const blk = &blk_g[threadIdx.y]; + schemadesc_s* schema; + block_desc_s* const blk = &blk_g[threadIdx.y]; uint32_t block_id = blockIdx.x * num_warps + threadIdx.y; size_t cur_row; uint32_t rows_remaining; @@ -267,7 +267,7 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) end = cur + blk->size; while (rows_remaining > 0 && cur < end) { uint32_t nrows; - const uint8_t *start = cur; + const uint8_t* start = cur; if (cur_row > first_row + max_rows) break; if (cur + min_row_size * rows_remaining == end) { @@ -311,10 +311,10 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) * @param[in] min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 */ -void DecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema, +void DecodeAvroColumnData(block_desc_s* blocks, + schemadesc_s* schema, device_span global_dictionary, - const uint8_t *avro_data, + const uint8_t* avro_data, uint32_t num_blocks, uint32_t schema_len, size_t max_rows, diff --git a/cpp/src/io/avro/avro_gpu.h b/cpp/src/io/avro/avro_gpu.h index a82d3604d02..a895d1bea02 100644 --- a/cpp/src/io/avro/avro_gpu.h +++ b/cpp/src/io/avro/avro_gpu.h @@ -33,7 +33,7 @@ struct schemadesc_s { uint32_t kind; // avro type kind uint32_t count; // for records/unions: number of following child columns, for nulls: global // null_count, for enums: dictionary ofs - void *dataptr; // Ptr to column data, or null if column not selected + void* dataptr; // Ptr to column data, or null if column not selected }; /** @@ -50,10 +50,10 @@ struct schemadesc_s { * @param[in] min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 */ -void DecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema, +void DecodeAvroColumnData(block_desc_s* blocks, + schemadesc_s* schema, cudf::device_span global_dictionary, - const uint8_t *avro_data, + const uint8_t* avro_data, uint32_t num_blocks, uint32_t schema_len, size_t max_rows = ~0, diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 21253ce8cdf..f6ffdd99d35 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -46,7 +46,7 @@ namespace { /** * @brief Function that translates Avro data kind to cuDF type enum */ -type_id to_type_id(const avro::schema_entry *col) +type_id to_type_id(const avro::schema_entry* col) { switch (col->kind) { case avro::type_boolean: return type_id::BOOL8; @@ -69,7 +69,7 @@ type_id to_type_id(const avro::schema_entry *col) */ class metadata : public file_metadata { public: - explicit metadata(datasource *const src) : source(src) {} + explicit metadata(datasource* const src) : source(src) {} /** * @brief Initializes the parser and filters down to a subset of rows @@ -77,7 
+77,7 @@ class metadata : public file_metadata { * @param[in,out] row_start Starting row of the selection * @param[in,out] row_count Total number of rows selected */ - void init_and_select_rows(int &row_start, int &row_count) + void init_and_select_rows(int& row_start, int& row_count) { const auto buffer = source->host_read(0, source->size()); avro::container pod(buffer->data(), buffer->size()); @@ -100,7 +100,7 @@ class metadata : public file_metadata { const auto num_avro_columns = static_cast(columns.size()); if (!use_names.empty()) { int index = 0; - for (const auto &use_name : use_names) { + for (const auto& use_name : use_names) { for (int i = 0; i < num_avro_columns; ++i, ++index) { if (index >= num_avro_columns) { index = 0; } if (columns[index].name == use_name && @@ -135,10 +135,10 @@ class metadata : public file_metadata { } private: - datasource *const source; + datasource* const source; }; -rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_block_data, +rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer& comp_block_data, rmm::cuda_stream_view stream) { size_t uncompressed_data_size = 0; @@ -149,12 +149,14 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ // Guess an initial maximum uncompressed block size uint32_t initial_blk_len = (_metadata->max_block_size * 2 + 0xfff) & ~0xfff; uncompressed_data_size = initial_blk_len * _metadata->block_list.size(); - for (size_t i = 0; i < inflate_in.size(); ++i) { inflate_in[i].dstSize = initial_blk_len; } + for (size_t i = 0; i < inflate_in.size(); ++i) { + inflate_in[i].dstSize = initial_blk_len; + } } else if (_metadata->codec == "snappy") { // Extract the uncompressed length from the snappy stream for (size_t i = 0; i < _metadata->block_list.size(); i++) { const auto buffer = _source->host_read(_metadata->block_list[i].offset, 4); - const uint8_t *blk = buffer->data(); + const uint8_t* blk = buffer->data(); uint32_t blk_len = blk[0]; if (blk_len > 0x7f) { blk_len = (blk_len & 0x7f) | (blk[1] << 7); @@ -176,9 +178,9 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ for (size_t i = 0, dst_pos = 0; i < _metadata->block_list.size(); i++) { const auto src_pos = _metadata->block_list[i].offset - base_offset; - inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; + inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; inflate_in[i].srcSize = _metadata->block_list[i].size; - inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; + inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; // Update blocks offsets & sizes to refer to uncompressed data _metadata->block_list[i].offset = dst_pos; @@ -215,7 +217,7 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ if (actual_uncompressed_size > uncompressed_data_size) { decomp_block_data.resize(actual_uncompressed_size, stream); for (size_t i = 0, dst_pos = 0; i < _metadata->block_list.size(); i++) { - auto dst_base = static_cast(decomp_block_data.data()); + auto dst_base = static_cast(decomp_block_data.data()); inflate_in[i].dstDevice = dst_base + dst_pos; _metadata->block_list[i].offset = dst_pos; @@ -233,12 +235,12 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ return decomp_block_data; } -void reader::impl::decode_data(const rmm::device_buffer &block_data, - const std::vector> &dict, +void reader::impl::decode_data(const 
rmm::device_buffer& block_data, + const std::vector>& dict, device_span global_dictionary, size_t num_rows, std::vector> selection, - std::vector &out_buffers, + std::vector& out_buffers, rmm::cuda_stream_view stream) { // Build gpu schema @@ -277,7 +279,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, _metadata->schema[i + 2].kind == type_null)), "Union with non-null type not currently supported"); } - std::vector valid_alias(out_buffers.size(), nullptr); + std::vector valid_alias(out_buffers.size(), nullptr); for (size_t i = 0; i < out_buffers.size(); i++) { const auto col_idx = selection[i].first; int schema_data_idx = _metadata->columns[col_idx].schema_data_idx; @@ -302,10 +304,10 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, _metadata->block_list.data(), _metadata->block_list.size() * sizeof(block_desc_s), stream); schema_desc.host_to_device(stream); - gpu::DecodeAvroColumnData(static_cast(block_list.data()), + gpu::DecodeAvroColumnData(static_cast(block_list.data()), schema_desc.device_ptr(), global_dictionary, - static_cast(block_data.data()), + static_cast(block_data.data()), static_cast(_metadata->block_list.size()), static_cast(schema_desc.size()), _metadata->num_rows, @@ -333,15 +335,15 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, } reader::impl::impl(std::unique_ptr source, - avro_reader_options const &options, - rmm::mr::device_memory_resource *mr) + avro_reader_options const& options, + rmm::mr::device_memory_resource* mr) : _mr(mr), _source(std::move(source)), _columns(options.get_columns()) { // Open the source Avro dataset metadata _metadata = std::make_unique(_source.get()); } -table_with_metadata reader::impl::read(avro_reader_options const &options, +table_with_metadata reader::impl::read(avro_reader_options const& options, rmm::cuda_stream_view stream) { auto skip_rows = options.get_skip_rows(); @@ -358,8 +360,8 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, if (selected_columns.size() != 0) { // Get a list of column data types std::vector column_types; - for (const auto &col : selected_columns) { - auto &col_schema = _metadata->schema[_metadata->columns[col.first].schema_data_idx]; + for (const auto& col : selected_columns) { + auto& col_schema = _metadata->schema[_metadata->columns[col.first].schema_data_idx]; auto col_type = to_type_id(&col_schema); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); @@ -372,7 +374,7 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, block_data = rmm::device_buffer{_metadata->total_data_size, stream}; auto read_bytes = _source->device_read(_metadata->block_list[0].offset, _metadata->total_data_size, - static_cast(block_data.data()), + static_cast(block_data.data()), stream); block_data.resize(read_bytes, stream); } else { @@ -396,11 +398,13 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, std::vector> dict(column_types.size()); for (size_t i = 0; i < column_types.size(); ++i) { auto col_idx = selected_columns[i].first; - auto &col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; + auto& col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; dict[i].first = static_cast(total_dictionary_entries); dict[i].second = static_cast(col_schema.symbols.size()); total_dictionary_entries += dict[i].second; - for (const auto &sym : col_schema.symbols) { dictionary_data_size += sym.length(); } + for (const auto& sym : 
col_schema.symbols) { + dictionary_data_size += sym.length(); + } } rmm::device_uvector d_global_dict(total_dictionary_entries, stream); @@ -411,10 +415,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; - auto const &col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; + auto const& col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; auto const col_dict_entries = &(h_global_dict[dict[i].first]); for (size_t j = 0; j < dict[i].second; j++) { - auto const &symbols = col_schema.symbols[j]; + auto const& symbols = col_schema.symbols[j]; auto const data_dst = h_global_dict_data.data() + dict_pos; auto const len = symbols.length(); @@ -471,20 +475,20 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, } // Forward to implementation -reader::reader(std::vector const &filepaths, - avro_reader_options const &options, +reader::reader(std::vector const& filepaths, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); _impl = std::make_unique(datasource::create(filepaths[0]), options, mr); } // Forward to implementation -reader::reader(std::vector> &&sources, - avro_reader_options const &options, +reader::reader(std::vector>&& sources, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); _impl = std::make_unique(std::move(sources[0]), options, mr); @@ -494,7 +498,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(avro_reader_options const &options, rmm::cuda_stream_view stream) +table_with_metadata reader::read(avro_reader_options const& options, rmm::cuda_stream_view stream) { return _impl->read(options, stream); } diff --git a/cpp/src/io/avro/reader_impl.hpp b/cpp/src/io/avro/reader_impl.hpp index 8e09da03563..9af32ed88a0 100644 --- a/cpp/src/io/avro/reader_impl.hpp +++ b/cpp/src/io/avro/reader_impl.hpp @@ -61,8 +61,8 @@ class reader::impl { * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr source, - avro_reader_options const &options, - rmm::mr::device_memory_resource *mr); + avro_reader_options const& options, + rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data and returns a set of columns @@ -72,7 +72,7 @@ class reader::impl { * * @return The set of columns along with metadata */ - table_with_metadata read(avro_reader_options const &options, rmm::cuda_stream_view stream); + table_with_metadata read(avro_reader_options const& options, rmm::cuda_stream_view stream); private: /** @@ -83,7 +83,7 @@ class reader::impl { * * @return Device buffer to decompressed block data */ - rmm::device_buffer decompress_data(const rmm::device_buffer &comp_block_data, + rmm::device_buffer decompress_data(const rmm::device_buffer& comp_block_data, rmm::cuda_stream_view stream); /** @@ -95,16 +95,16 @@ class reader::impl { * @param out_buffers Output columns' device buffers * @param stream CUDA stream used for device memory operations and kernel launches. 
    */
-  void decode_data(const rmm::device_buffer &block_data,
-                   const std::vector<std::pair<uint32_t, uint32_t>> &dict,
+  void decode_data(const rmm::device_buffer& block_data,
+                   const std::vector<std::pair<uint32_t, uint32_t>>& dict,
                    cudf::device_span<string_index_pair const> global_dictionary,
                    size_t num_rows,
                    std::vector<std::pair<int, std::string>> columns,
-                   std::vector<column_buffer> &out_buffers,
+                   std::vector<column_buffer>& out_buffers,
                    rmm::cuda_stream_view stream);
 
  private:
-  rmm::mr::device_memory_resource *_mr = nullptr;
+  rmm::mr::device_memory_resource* _mr = nullptr;
   std::unique_ptr<datasource> _source;
   std::unique_ptr<metadata> _metadata;
diff --git a/cpp/src/io/comp/brotli_dict.cpp b/cpp/src/io/comp/brotli_dict.cpp
index b493ebd6bfb..3e6939bb816 100644
--- a/cpp/src/io/comp/brotli_dict.cpp
+++ b/cpp/src/io/comp/brotli_dict.cpp
@@ -6528,7 +6528,7 @@ static const brotli_dictionary_s g_dictionary = {
   136, 224, 164, 184, 224, 164, 149, 224, 165, 141, 224, 164, 176, 224, 164, 191,
   224, 164, 175, 224, 164, 164, 224, 164, 190}};
 
-const brotli_dictionary_s *get_brotli_dictionary(void) { return &g_dictionary; }
+const brotli_dictionary_s* get_brotli_dictionary(void) { return &g_dictionary; }
 
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/comp/brotli_dict.h b/cpp/src/io/comp/brotli_dict.h
index c4114b7fbcf..4c1fec1492c 100644
--- a/cpp/src/io/comp/brotli_dict.h
+++ b/cpp/src/io/comp/brotli_dict.h
@@ -79,7 +79,7 @@ struct brotli_dictionary_s {
 constexpr int brotli_min_dictionary_word_length = 4;
 constexpr int brotli_max_dictionary_word_length = 24;
 
-const brotli_dictionary_s *get_brotli_dictionary(void);
+const brotli_dictionary_s* get_brotli_dictionary(void);
 
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp
index 28d7394e485..f4cb6edd41f 100644
--- a/cpp/src/io/comp/cpu_unbz2.cpp
+++ b/cpp/src/io/comp/cpu_unbz2.cpp
@@ -101,13 +101,13 @@ namespace io {
 
 // Constants for the back end.
 
 #define BZ_MAX_ALPHA_SIZE 258
-#define BZ_MAX_CODE_LEN   23
+#define BZ_MAX_CODE_LEN 23
 
 #define BZ_RUNA 0
 #define BZ_RUNB 1
 
 #define BZ_N_GROUPS 6
-#define BZ_G_SIZE   50
+#define BZ_G_SIZE 50
 
 #define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE))
 
@@ -121,16 +121,16 @@ typedef struct {
 // Decoder state
 typedef struct {
   // Input
-  const uint8_t *cur;
-  const uint8_t *end;
-  const uint8_t *base;
+  const uint8_t* cur;
+  const uint8_t* end;
+  const uint8_t* base;
   uint64_t bitbuf;
   uint32_t bitpos;
 
   // Output
-  uint8_t *out;
-  uint8_t *outend;
-  uint8_t *outbase;
+  uint8_t* out;
+  uint8_t* outend;
+  uint8_t* outbase;
 
   // misc administratium
   uint32_t blockSize100k;
@@ -156,25 +156,25 @@ typedef struct {
 } unbz_state_s;
 
 // return next 32 bits
-static inline uint32_t next32bits(const unbz_state_s *s)
+static inline uint32_t next32bits(const unbz_state_s* s)
 {
   return (uint32_t)((s->bitbuf << s->bitpos) >> 32);
 }
 
 // return next n bits
-static inline uint32_t showbits(const unbz_state_s *s, uint32_t n)
+static inline uint32_t showbits(const unbz_state_s* s, uint32_t n)
 {
   return (uint32_t)((s->bitbuf << s->bitpos) >> (64 - n));
 }
 
 // update bit position, refill bit buffer if necessary
-static void skipbits(unbz_state_s *s, uint32_t n)
+static void skipbits(unbz_state_s* s, uint32_t n)
 {
   uint32_t bitpos = s->bitpos + n;
   if (bitpos >= 32) {
-    const uint8_t *cur = s->cur + 4;
+    const uint8_t* cur = s->cur + 4;
     uint32_t next32 =
-      (cur + 4 < s->end) ? __builtin_bswap32(*reinterpret_cast<const uint32_t *>(cur + 4)) : 0;
+      (cur + 4 < s->end) ? __builtin_bswap32(*reinterpret_cast<const uint32_t*>(cur + 4)) : 0;
     s->cur = cur;
     s->bitbuf = (s->bitbuf << 32) | next32;
     bitpos &= 0x1f;
@@ -182,7 +182,7 @@ static void skipbits(unbz_state_s *s, uint32_t n)
   s->bitpos = bitpos;
 }
 
-static inline uint32_t getbits(unbz_state_s *s, uint32_t n)
+static inline uint32_t getbits(unbz_state_s* s, uint32_t n)
 {
   uint32_t bits = showbits(s, n);
   skipbits(s, n);
@@ -190,7 +190,7 @@ static inline uint32_t getbits(unbz_state_s *s, uint32_t n)
 }
 
 /*---------------------------------------------------*/
-int32_t bz2_decompress_block(unbz_state_s *s)
+int32_t bz2_decompress_block(unbz_state_s* s)
 {
   int nInUse;
 
@@ -204,7 +204,7 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   int32_t groupNo;
   int32_t groupPos;
   uint32_t nblock, nblockMAX;
-  const huff_s *gSel = nullptr;
+  const huff_s* gSel = nullptr;
 
   uint32_t inUse16;
   uint32_t sig0, sig1;
@@ -263,11 +263,11 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   // Now the coding tables
   for (t = 0; t < nGroups; t++) {
     int32_t pp, vec;
-    uint8_t *length = &s->len[0];
+    uint8_t* length = &s->len[0];
     int32_t curr = getbits(s, 5);
     int32_t minLen = BZ_MAX_CODE_LEN - 1;
     int32_t maxLen = 0;
-    huff_s *sel = &s->ht[t];
+    huff_s* sel = &s->ht[t];
     for (i = 0; i < alphaSize; i++) {
       for (;;) {
         uint32_t v = showbits(s, 2);
@@ -297,9 +297,11 @@ int32_t bz2_decompress_block(unbz_state_s *s)
       sel->base[i] = 0;
       sel->limit[i] = 0;
     }
-    for (i = 0; i < alphaSize; i++) sel->base[length[i] + 1]++;
+    for (i = 0; i < alphaSize; i++)
+      sel->base[length[i] + 1]++;
 
-    for (i = 1; i < BZ_MAX_CODE_LEN; i++) sel->base[i] += sel->base[i - 1];
+    for (i = 1; i < BZ_MAX_CODE_LEN; i++)
+      sel->base[i] += sel->base[i - 1];
 
     vec = 0;
     for (i = minLen; i <= maxLen; i++) {
@@ -318,7 +320,8 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   EOB = nInUse + 1;
   nblockMAX = 100000 * s->blockSize100k;
 
-  for (i = 0; i <= 255; i++) s->unzftab[i] = 0;
+  for (i = 0; i <= 255; i++)
+    s->unzftab[i] = 0;
 
   // MTF init
   {
@@ -456,7 +459,7 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   // Verify the end-of-block signature: should be followed by another block or an end-of-stream
   // signature
   {
-    const uint8_t *save_cur = s->cur;
+    const uint8_t* save_cur = s->cur;
     uint64_t save_bitbuf = s->bitbuf;
     uint32_t save_bitpos = s->bitpos;
     sig0 = getbits(s, 24);
@@ -476,14 +479,14 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   }
 }
 
-static void bzUnRLE(unbz_state_s *s)
+static void bzUnRLE(unbz_state_s* s)
 {
-  uint8_t *out = s->out;
-  uint8_t *outend = s->outend;
+  uint8_t* out = s->out;
+  uint8_t* outend = s->outend;
 
   int32_t rle_cnt = s->save_nblock;
   int cprev = -1;
-  std::vector<uint32_t> &tt = s->tt;
+  std::vector<uint32_t>& tt = s->tt;
   uint32_t pos = tt[s->origPtr] >> 8;
   int mask = ~0;
 
@@ -520,7 +523,7 @@ static void bzUnRLE(unbz_state_s *s)
 }
 
 int32_t cpu_bz2_uncompress(
-  const uint8_t *source, size_t sourceLen, uint8_t *dest, size_t *destLen, uint64_t *block_start)
+  const uint8_t* source, size_t sourceLen, uint8_t* dest, size_t* destLen, uint64_t* block_start)
 {
   unbz_state_s s{};
   uint32_t v;
@@ -534,7 +537,7 @@ int32_t cpu_bz2_uncompress(
   s.base = source;
   s.end = source + sourceLen - 4;  // We will not read the final combined CRC (last 4 bytes of the file)
-  s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t *>(source));
+  s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t*>(source));
   s.bitpos = 0;
 
   s.out = dest;
@@ -560,7 +563,7 @@ int32_t cpu_bz2_uncompress(
       s.cur = source + (size_t)(bit_offs >> 3);
       s.bitpos = (uint32_t)(bit_offs & 7);
       if (s.cur + 8 > s.end) return BZ_PARAM_ERROR;
-      s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t *>(s.cur));
+      s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t*>(s.cur));
     }
   }
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 541163eb086..3f38dce3fa3 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -97,12 +97,12 @@ __inline__ __device__ int brotli_context_lut(int mode) { return (mode << 9); }
 
 inline __device__ uint8_t brotli_transform_type(int idx) { return kTransformsData[(idx * 3) + 1]; }
 
-inline __device__ const uint8_t *brotli_transform_prefix(int idx)
+inline __device__ const uint8_t* brotli_transform_prefix(int idx)
 {
   return &kPrefixSuffix[kPrefixSuffixMap[kTransformsData[(idx * 3)]]];
 }
 
-inline __device__ const uint8_t *brotli_transform_suffix(int idx)
+inline __device__ const uint8_t* brotli_transform_suffix(int idx)
 {
   return &kPrefixSuffix[kPrefixSuffixMap[kTransformsData[(idx * 3) + 2]]];
 }
@@ -138,12 +138,12 @@ struct debrotli_huff_tree_group_s {
   uint16_t max_symbol;
   uint16_t num_htrees;
   uint16_t pad;
-  uint16_t *htrees[1];
+  uint16_t* htrees[1];
 };
 
 // Must be able to at least hold worst-case context maps, tree groups and context modes
 constexpr int local_heap_size =
-  (256 * 64 + 256 * 4 + 3 * (sizeof(debrotli_huff_tree_group_s) + 255 * sizeof(uint16_t *)) + 256 +
+  (256 * 64 + 256 * 4 + 3 * (sizeof(debrotli_huff_tree_group_s) + 255 * sizeof(uint16_t*)) + 256 +
    3 * brotli_huffman_max_size_258 * sizeof(uint16_t) +
    3 * brotli_huffman_max_size_26 * sizeof(uint16_t));
@@ -152,15 +152,15 @@ constexpr int local_heap_size =
  */
 struct debrotli_state_s {
   // Bitstream
-  const uint8_t *cur;
-  const uint8_t *end;
-  const uint8_t *base;
+  const uint8_t* cur;
+  const uint8_t* end;
+  const uint8_t* base;
   uint2 bitbuf;
   uint32_t bitpos;
   int32_t error;
   // Output
-  uint8_t *outbase;
-  uint8_t *out;
+  uint8_t* outbase;
+  uint8_t* out;
   size_t bytes_left;
   // Decoded symbols
   uint8_t window_bits;
@@ -178,19 +178,19 @@ struct debrotli_state_s {
   uint32_t meta_block_len;
   uint16_t heap_used;
   uint16_t heap_limit;
-  uint8_t *context_map;
-  uint8_t *dist_context_map;
-  uint8_t *context_modes;
-  uint8_t *fb_base;
+  uint8_t* context_map;
+  uint8_t* dist_context_map;
+  uint8_t* context_modes;
+  uint8_t* fb_base;
   uint32_t fb_size;
   uint8_t block_type_rb[6];
   uint8_t pad[2];
   int dist_rb_idx;
   int dist_rb[4];
-  debrotli_huff_tree_group_s *literal_hgroup;
-  debrotli_huff_tree_group_s *insert_copy_hgroup;
-  debrotli_huff_tree_group_s *distance_hgroup;
-  uint16_t *block_type_vlc[3];
+  debrotli_huff_tree_group_s* literal_hgroup;
+  debrotli_huff_tree_group_s* insert_copy_hgroup;
+  debrotli_huff_tree_group_s* distance_hgroup;
+  uint16_t* block_type_vlc[3];
   huff_scratch_s hs;
   uint32_t mtf[65];
   uint64_t heap[local_heap_size / 8];
@@ -199,54 +199,54 @@ inline __device__ uint32_t Log2Floor(uint32_t value) { return 32 - __clz(value); }
 
 /// @brief initializes the bit reader
-__device__ void initbits(debrotli_state_s *s, const uint8_t *base, size_t len, size_t pos = 0)
+__device__ void initbits(debrotli_state_s* s, const uint8_t* base, size_t len, size_t pos = 0)
 {
-  const uint8_t *p = base + pos;
+  const uint8_t* p = base + pos;
   uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3);
   p -= prefix_bytes;
   s->base = base;
   s->end = base + len;
   s->cur = p;
-  s->bitbuf.x = (p < s->end) ? *reinterpret_cast<const uint32_t *>(p) : 0;
+  s->bitbuf.x = (p < s->end) ? *reinterpret_cast<const uint32_t*>(p) : 0;
   p += 4;
-  s->bitbuf.y = (p < s->end) ? *reinterpret_cast<const uint32_t *>(p) : 0;
+  s->bitbuf.y = (p < s->end) ? *reinterpret_cast<const uint32_t*>(p) : 0;
   s->bitpos = prefix_bytes * 8;
 }
 
 // return next 32 bits
-inline __device__ uint32_t next32bits(const debrotli_state_s *s)
+inline __device__ uint32_t next32bits(const debrotli_state_s* s)
 {
   return __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
 }
 
 /// return next n bits
-inline __device__ uint32_t showbits(const debrotli_state_s *s, uint32_t n)
+inline __device__ uint32_t showbits(const debrotli_state_s* s, uint32_t n)
 {
   uint32_t next32 = __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
   return (next32 & ((1 << n) - 1));
 }
 
-inline __device__ void skipbits(debrotli_state_s *s, uint32_t n)
+inline __device__ void skipbits(debrotli_state_s* s, uint32_t n)
 {
   uint32_t bitpos = s->bitpos + n;
   if (bitpos >= 32) {
-    const uint8_t *cur = s->cur + 8;
+    const uint8_t* cur = s->cur + 8;
     s->bitbuf.x = s->bitbuf.y;
-    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<const uint32_t *>(cur) : 0;
+    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<const uint32_t*>(cur) : 0;
     s->cur = cur - 4;
     bitpos &= 0x1f;
   }
   s->bitpos = bitpos;
 }
 
-inline __device__ uint32_t getbits(debrotli_state_s *s, uint32_t n)
+inline __device__ uint32_t getbits(debrotli_state_s* s, uint32_t n)
 {
   uint32_t bits = showbits(s, n);
   skipbits(s, n);
   return bits;
 }
 
-inline __device__ uint32_t getbits_bytealign(debrotli_state_s *s)
+inline __device__ uint32_t getbits_bytealign(debrotli_state_s* s)
 {
   uint32_t n = (uint32_t)((-(int32_t)s->bitpos) & 7);
   uint32_t bits = showbits(s, n);
@@ -271,7 +271,7 @@ inline __device__ uint32_t getbits_bytealign(debrotli_state_s *s)
  * 65..128    xxxxxx1101
  * 129..256   xxxxxxx1111
 */
-static __device__ uint32_t getbits_u8vlc(debrotli_state_s *s)
+static __device__ uint32_t getbits_u8vlc(debrotli_state_s* s)
 {
   uint32_t next32 = next32bits(s);
   uint32_t v, len;
@@ -288,7 +288,7 @@ static __device__ uint32_t getbits_u8vlc(debrotli_state_s *s)
 }
 
 /// Decode a Huffman code with 8-bit initial lookup
-static __device__ uint32_t getvlc(debrotli_state_s *s, const uint16_t *lut)
+static __device__ uint32_t getvlc(debrotli_state_s* s, const uint16_t* lut)
 {
   uint32_t next32 = next32bits(s);
   uint32_t vlc, len;
@@ -308,12 +308,12 @@ static __device__ uint32_t getvlc(debrotli_state_s *s, const uint16_t *lut)
 }
 
 /// Alloc bytes from the local (shared mem) heap
-static __device__ uint8_t *local_alloc(debrotli_state_s *s, uint32_t bytes)
+static __device__ uint8_t* local_alloc(debrotli_state_s* s, uint32_t bytes)
 {
   int heap_used = s->heap_used;
   int len = (bytes + 7) >> 3;
   if (heap_used + len <= s->heap_limit) {
-    uint8_t *ptr = reinterpret_cast<uint8_t *>(&s->heap[heap_used]);
+    uint8_t* ptr = reinterpret_cast<uint8_t*>(&s->heap[heap_used]);
     s->heap_used = (uint16_t)(heap_used + len);
     return ptr;
   } else {
@@ -323,7 +323,7 @@ static __device__ uint8_t *local_alloc(debrotli_state_s *s, uint32_t bytes)
 
 /// Shrink the size of the local heap, returns ptr to end (used for stack-like intermediate
 /// allocations at the end of the heap)
-static __device__ uint8_t *local_heap_shrink(debrotli_state_s *s, uint32_t bytes)
+static __device__ uint8_t* local_heap_shrink(debrotli_state_s* s, uint32_t bytes)
 {
   int heap_used = s->heap_used;
   int heap_limit = s->heap_limit;
@@ -331,13 +331,13 @@ static __device__ uint8_t *local_heap_shrink(debrotli_state_s *s, uint32_t bytes
   if (heap_limit - len >= heap_used) {
     heap_limit -= len;
     s->heap_limit = (uint16_t)heap_limit;
-    return reinterpret_cast<uint8_t *>(&s->heap[heap_limit]);
+    return reinterpret_cast<uint8_t*>(&s->heap[heap_limit]);
   } else {
     return nullptr;
   }
 }
 
-static __device__ void local_heap_grow(debrotli_state_s *s, uint32_t bytes)
+static __device__ void local_heap_grow(debrotli_state_s* s, uint32_t bytes)
 {
   int len = (bytes + 7) >> 3;
   int heap_limit = s->heap_limit + len;
@@ -345,16 +345,16 @@ static __device__ void local_heap_grow(debrotli_state_s *s, uint32_t bytes)
 }
 
 /// Alloc memory from the fixed-size heap shared between all blocks (thread0-only)
-static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
-                                          uint8_t *ext_heap_base,
+static __device__ uint8_t* ext_heap_alloc(uint32_t bytes,
+                                          uint8_t* ext_heap_base,
                                           uint32_t ext_heap_size)
 {
   uint32_t len = (bytes + 0xf) & ~0xf;
-  volatile uint32_t *heap_ptr = reinterpret_cast<volatile uint32_t *>(ext_heap_base);
+  volatile uint32_t* heap_ptr = reinterpret_cast<volatile uint32_t*>(ext_heap_base);
   uint32_t first_free_block = ~0;
   for (;;) {
     uint32_t blk_next, blk_prev;
-    first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
+    first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block);
     if (first_free_block == ~0 || first_free_block >= ext_heap_size) {
       // Some other block is holding the heap or there are no free blocks: try again later
       continue;
@@ -373,7 +373,7 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
       uint32_t next, blksz;
       if (((blk_next & 3) != 0) || (blk_next >= ext_heap_size)) {
         // Corrupted heap
-        atomicExch((unsigned int *)heap_ptr, first_free_block);
+        atomicExch((unsigned int*)heap_ptr, first_free_block);
         return nullptr;
       }
       next = heap_ptr[(blk_next >> 2) + 0];
@@ -398,14 +398,14 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
         }
         __threadfence();
         // Restore the list head
-        atomicExch((unsigned int *)heap_ptr, first_free_block);
+        atomicExch((unsigned int*)heap_ptr, first_free_block);
         return ext_heap_base + blk_next;
       } else {
         blk_prev = blk_next;
         blk_next = next;
       }
     } while (blk_next != 0 && blk_next < ext_heap_size);
-    first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
+    first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block);
     // Reaching here means the heap is full
     // Just in case we're trying to allocate more than the entire heap
     if (len > ext_heap_size - 4 * sizeof(uint32_t)) { break; }
@@ -414,17 +414,17 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
 }
 
 /// Free a memory block (thread0-only)
-static __device__ void ext_heap_free(void *ptr,
+static __device__ void ext_heap_free(void* ptr,
                                      uint32_t bytes,
-                                     uint8_t *ext_heap_base,
+                                     uint8_t* ext_heap_base,
                                      uint32_t ext_heap_size)
 {
   uint32_t len = (bytes + 0xf) & ~0xf;
-  volatile uint32_t *heap_ptr = (volatile uint32_t *)ext_heap_base;
+  volatile uint32_t* heap_ptr = (volatile uint32_t*)ext_heap_base;
   uint32_t first_free_block = ~0;
-  uint32_t cur_blk = static_cast<uint32_t>(static_cast<uint8_t *>(ptr) - ext_heap_base);
+  uint32_t cur_blk = static_cast<uint32_t>(static_cast<uint8_t*>(ptr) - ext_heap_base);
   for (;;) {
-    first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
+    first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block);
     if (first_free_block != ~0) { break; }
     // Some other block is holding the heap
   }
@@ -485,12 +485,12 @@ static __device__ void ext_heap_free(void *ptr,
     }
   }
   __threadfence();
-  atomicExch((unsigned int *)heap_ptr, first_free_block);
+  atomicExch((unsigned int*)heap_ptr, first_free_block);
 }
 
-static __device__ uint32_t BuildSimpleHuffmanTable(uint16_t *lut,
+static __device__ uint32_t BuildSimpleHuffmanTable(uint16_t* lut,
                                                    int root_bits,
-                                                   uint16_t *val,
+                                                   uint16_t* val,
                                                    uint32_t num_symbols)
 {
   uint32_t table_size = 1;
@@ -562,7 +562,7 @@ static __device__ uint32_t BuildSimpleHuffmanTable(uint16_t *lut,
   return goal_size;
 }
 
-static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
+static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s* hs)
 {
   uint32_t code;  // current table entry
   int symbol;     // symbol index in original or sorted table
@@ -592,7 +592,9 @@ static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
   // Special case: all symbols but one have 0 code length.
   if (hs->offset[0] == 0) {
     code = huffcode(0, hs->sorted[0]);
-    for (key = 0; key < table_size; ++key) { hs->lenvlctab[key] = code; }
+    for (key = 0; key < table_size; ++key) {
+      hs->lenvlctab[key] = code;
+    }
     return;
   }
 
@@ -606,7 +608,7 @@ static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
     for (int bits_count = hs->code_length_histo[bits]; bits_count != 0; --bits_count) {
       int end = table_size;
       code = huffcode(bits, hs->sorted[symbol++]);
-      uint16_t *p = &hs->lenvlctab[brev8(key)];
+      uint16_t* p = &hs->lenvlctab[brev8(key)];
       do {
         end -= step;
         p[end] = code;
@@ -621,7 +623,7 @@ static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
 // Returns the table width of the next 2nd level table. |count| is the histogram
 // of bit lengths for the remaining symbols, |len| is the code length of the
 // next processed symbol.
-static __device__ int NextTableBitSize(const uint16_t *const count, int len, int root_bits)
+static __device__ int NextTableBitSize(const uint16_t* const count, int len, int root_bits)
 {
   int left = 1 << (len - root_bits);
   while (len < 15) {
@@ -634,13 +636,13 @@ static __device__ int NextTableBitSize(const uint16_t *const count, int len, int
 }
 
 // Build a huffman lookup table (currently thread0-only)
-static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
+static __device__ uint32_t BuildHuffmanTable(uint16_t* root_lut,
                                              int root_bits,
-                                             const uint16_t *const symbol_lists,
-                                             uint16_t *count)
+                                             const uint16_t* const symbol_lists,
+                                             uint16_t* count)
 {
   uint32_t code;  // current table entry
-  uint16_t *lut;  // next available space in table
+  uint16_t* lut;  // next available space in table
   int len;        // current code length
   int symbol;     // symbol index in original or sorted table
   int key;        // prefix code
@@ -654,7 +656,8 @@ static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
   int max_length = -1;
   int bits;
 
-  while (symbol_lists[max_length] == 0xFFFF) max_length--;
+  while (symbol_lists[max_length] == 0xFFFF)
+    max_length--;
   max_length += 16;
 
   lut = root_lut;
@@ -677,7 +680,7 @@ static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
     for (int bits_count = count[bits]; bits_count != 0; --bits_count) {
       symbol = symbol_lists[symbol];
       code = huffcode(bits, symbol);
-      uint16_t *p = &lut[brev8(key)];
+      uint16_t* p = &lut[brev8(key)];
       int end = table_size;
       do {
        end -= step;
@@ -715,7 +718,7 @@ static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
       }
       symbol = symbol_lists[symbol];
       code = huffcode(len - root_bits, symbol);
-      uint16_t *p = &lut[brev8(sub_key)];
+      uint16_t* p = &lut[brev8(sub_key)];
       int end = table_size;
       do {
         end -= step;
@@ -883,10 +886,10 @@ invalid.
 */
 
 // Decode Huffman tree (thread0-only)
-static __device__ uint32_t DecodeHuffmanTree(debrotli_state_s *s,
+static __device__ uint32_t DecodeHuffmanTree(debrotli_state_s* s,
                                              uint32_t alphabet_size,
                                              uint32_t max_symbol,
-                                             uint16_t *vlctab)
+                                             uint16_t* vlctab)
 {
   uint32_t prefix_code_type;
@@ -916,8 +919,8 @@ static __device__ uint32_t DecodeHuffmanTree(debrotli_state_s *s,
       vlctab, huffman_lookup_table_width, s->hs.symbols_lists_array, nsym);
   } else {
     // Complex prefix code
-    huff_scratch_s *const hs = &s->hs;
-    uint16_t *symbol_lists =
+    huff_scratch_s* const hs = &s->hs;
+    uint16_t* symbol_lists =
       &s->hs.symbols_lists_array[16];  // Make small negative indexes addressable
     uint32_t space = 32, num_codes = 0, i, prev_code_len, symbol, repeat, repeat_code_len;
@@ -1069,7 +1072,7 @@ formula :
 
 window size = (1 << WBITS) - 16
 
 */
-static __device__ void DecodeStreamHeader(debrotli_state_s *s)
+static __device__ void DecodeStreamHeader(debrotli_state_s* s)
 {
   uint32_t next32 = next32bits(s);
   uint32_t wbits = 0, len = 0;
@@ -1155,7 +1158,7 @@ not set(if the ignored bits are not all zeros, the stream should be rejected as
 invalid)
 
 */
-static __device__ void DecodeMetaBlockHeader(debrotli_state_s *s)
+static __device__ void DecodeMetaBlockHeader(debrotli_state_s* s)
 {
   uint32_t next32 = next32bits(s);
   uint32_t len = 1, is_empty = 0;
@@ -1195,7 +1198,9 @@ static __device__ void DecodeMetaBlockHeader(debrotli_state_s *s)
       }
       skipbits(s, len);
       if (getbits_bytealign(s) != 0) { s->error = 1; }
-      for (len = mskiplen; len >= 32; len -= 32) { skipbits(s, 32); }
+      for (len = mskiplen; len >= 32; len -= 32) {
+        skipbits(s, 32);
+      }
     }
   }
   skipbits(s, len);
@@ -1238,17 +1243,17 @@ Block count code + extra bits for first distance block count, appears only if
 NBLTYPESD >= 2
 
 */
-static __device__ void DecodeHuffmanTables(debrotli_state_s *s)
+static __device__ void DecodeHuffmanTables(debrotli_state_s* s)
 {
   for (int b = 0; b < 3; b++) {
     uint32_t nbltypes = 1 + getbits_u8vlc(s);
     s->num_block_types[b] = nbltypes;
     if (nbltypes >= 2) {
       uint32_t alphabet_size = nbltypes + 2, index, nbits, maxtblsz;
-      uint16_t *vlctab;
+      uint16_t* vlctab;
       maxtblsz = kMaxHuffmanTableSize[(alphabet_size + 31) >> 5];
       maxtblsz = (maxtblsz > brotli_huffman_max_size_258) ? brotli_huffman_max_size_258 : maxtblsz;
-      vlctab = reinterpret_cast<uint16_t *>(
+      vlctab = reinterpret_cast<uint16_t*>(
         local_alloc(s, (brotli_huffman_max_size_26 + maxtblsz) * sizeof(uint16_t)));
       s->block_type_vlc[b] = vlctab;
       DecodeHuffmanTree(s, alphabet_size, alphabet_size, vlctab + brotli_huffman_max_size_26);
@@ -1286,13 +1291,13 @@ static __device__ void DecodeHuffmanTables(debrotli_state_s *s)
 * Most of input values are 0 and 1. To reduce number of branches, we replace
 * inner for loop with do-while.
 */
-static __device__ void InverseMoveToFrontTransform(debrotli_state_s *s, uint8_t *v, uint32_t v_len)
+static __device__ void InverseMoveToFrontTransform(debrotli_state_s* s, uint8_t* v, uint32_t v_len)
 {
   // Reinitialize elements that could have been changed.
   uint32_t i = 1;
   uint32_t upper_bound = s->mtf_upper_bound;
-  uint32_t *mtf = &s->mtf[1];  // Make mtf[-1] addressable.
-  uint8_t *mtf_u8 = reinterpret_cast<uint8_t *>(mtf);
+  uint32_t* mtf = &s->mtf[1];  // Make mtf[-1] addressable.
+  uint8_t* mtf_u8 = reinterpret_cast<uint8_t*>(mtf);
   uint32_t pattern = 0x03020100;  // Little-endian
   // Initialize list using 4 consequent values pattern.
@@ -1320,10 +1325,10 @@ static __device__ void InverseMoveToFrontTransform(debrotli_state_s *s, uint8_t
   s->mtf_upper_bound = upper_bound >> 2;
 }
 
-static __device__ uint32_t DecodeContextMap(debrotli_state_s *s,
-                                            uint8_t *context_map,
+static __device__ uint32_t DecodeContextMap(debrotli_state_s* s,
+                                            uint8_t* context_map,
                                             uint32_t context_map_size,
-                                            uint16_t *context_map_vlc)
+                                            uint16_t* context_map_vlc)
 {
   uint32_t num_htrees = getbits_u8vlc(s) + 1;
   uint32_t bits, context_index, max_run_length_prefix, alphabet_size;
@@ -1367,7 +1372,7 @@ static __device__ uint32_t DecodeContextMap(debrotli_state_s *s,
   return num_htrees;
 }
 
-static __device__ void DetectTrivialLiteralBlockTypes(debrotli_state_s *s)
+static __device__ void DetectTrivialLiteralBlockTypes(debrotli_state_s* s)
 {
   uint32_t i;
   for (i = 0; i < s->num_block_types[0]; i++) {
@@ -1375,7 +1380,9 @@ static __device__ void DetectTrivialLiteralBlockTypes(debrotli_state_s *s)
     uint32_t error = 0;
     uint32_t sample = s->context_map[offset];
     uint32_t j;
-    for (j = 0; j < (1u << 6); ++j) { error |= s->context_map[offset + j] ^ sample; }
+    for (j = 0; j < (1u << 6); ++j) {
+      error |= s->context_map[offset + j] ^ sample;
+    }
     if (error == 0) { s->context_modes[i] |= 4u; }
   }
 }
@@ -1405,13 +1412,13 @@ appears only if NTREESD >= 2; otherwise, the context map has only zero values
 
 */
-static __device__ debrotli_huff_tree_group_s *HuffmanTreeGroupInit(debrotli_state_s *s,
-                                                                   uint32_t alphabet_size,
-                                                                   uint32_t max_symbol,
-                                                                   uint32_t ntrees)
+static __device__ debrotli_huff_tree_group_s* HuffmanTreeGroupInit(debrotli_state_s* s,
+                                                                   uint32_t alphabet_size,
+                                                                   uint32_t max_symbol,
+                                                                   uint32_t ntrees)
 {
-  debrotli_huff_tree_group_s *group = reinterpret_cast<debrotli_huff_tree_group_s *>(local_alloc(
-    s, sizeof(debrotli_huff_tree_group_s) + ntrees * sizeof(uint16_t *) - sizeof(uint16_t *)));
+  debrotli_huff_tree_group_s* group = reinterpret_cast<debrotli_huff_tree_group_s*>(local_alloc(
+    s, sizeof(debrotli_huff_tree_group_s) + ntrees * sizeof(uint16_t*) - sizeof(uint16_t*)));
   group->alphabet_size = (uint16_t)alphabet_size;
   group->max_symbol = (uint16_t)max_symbol;
   group->num_htrees = (uint16_t)ntrees;
@@ -1419,26 +1426,26 @@ static __device__ debrotli_huff_tree_group_s *HuffmanTreeGroupInit(debrotli_stat
   return group;
 }
 
-static __device__ void HuffmanTreeGroupAlloc(debrotli_state_s *s, debrotli_huff_tree_group_s *group)
+static __device__ void HuffmanTreeGroupAlloc(debrotli_state_s* s, debrotli_huff_tree_group_s* group)
 {
   if (!group->htrees[0]) {
     uint32_t alphabet_size = group->alphabet_size;
     uint32_t ntrees = group->num_htrees;
     uint32_t max_table_size = kMaxHuffmanTableSize[(alphabet_size + 31) >> 5];
     uint32_t code_size = sizeof(uint16_t) * ntrees * max_table_size;
-    group->htrees[0] = reinterpret_cast<uint16_t *>(local_alloc(s, code_size));
+    group->htrees[0] = reinterpret_cast<uint16_t*>(local_alloc(s, code_size));
     if (!group->htrees[0]) {
-      if (s->fb_base) { group->htrees[0] = reinterpret_cast<uint16_t *>(s->fb_base + s->fb_size); }
+      if (s->fb_base) { group->htrees[0] = reinterpret_cast<uint16_t*>(s->fb_base + s->fb_size); }
       s->fb_size += (code_size + 3) & ~3;
     }
   }
 }
 
 // Decodes a series of Huffman table using ReadHuffmanCode function.
-static __device__ void HuffmanTreeGroupDecode(debrotli_state_s *s,
-                                              debrotli_huff_tree_group_s *group)
+static __device__ void HuffmanTreeGroupDecode(debrotli_state_s* s,
+                                              debrotli_huff_tree_group_s* group)
 {
-  uint16_t *next = group->htrees[0];
+  uint16_t* next = group->htrees[0];
   for (int htree_index = 0; htree_index < group->num_htrees; htree_index++) {
     uint32_t table_size = DecodeHuffmanTree(s, group->alphabet_size, group->max_symbol, next);
@@ -1448,13 +1455,13 @@ static __device__ void HuffmanTreeGroupDecode(debrotli_state_s *s,
   }
 }
 
-static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s *s,
-                                               uint8_t *fb_heap_base,
+static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s* s,
+                                               uint8_t* fb_heap_base,
                                                uint32_t fb_heap_size)
 {
   uint32_t bits, npostfix, ndirect, nbltypesl;
   uint32_t context_map_size;
-  uint16_t *context_map_vlc;
+  uint16_t* context_map_vlc;
   uint32_t num_direct_codes, num_distance_codes, num_literal_htrees, num_dist_htrees;
 
   // Decode context maps
@@ -1466,8 +1473,10 @@ static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s *s,
   s->distance_postfix_mask = (1 << npostfix) - 1;
   nbltypesl = s->num_block_types[0];
   s->context_modes = local_alloc(s, nbltypesl);
-  for (uint32_t i = 0; i < nbltypesl; i++) { s->context_modes[i] = getbits(s, 2); }
-  context_map_vlc = reinterpret_cast<uint16_t *>(
+  for (uint32_t i = 0; i < nbltypesl; i++) {
+    s->context_modes[i] = getbits(s, 2);
+  }
+  context_map_vlc = reinterpret_cast<uint16_t*>(
     local_heap_shrink(s, brotli_huffman_max_size_272 * sizeof(uint16_t)));
   context_map_size = nbltypesl << 6;
   s->context_map = local_alloc(s, context_map_size);
@@ -1514,7 +1523,7 @@ static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s *s,
   HuffmanTreeGroupDecode(s, s->distance_hgroup);
 }
 
-static __device__ int PrepareLiteralDecoding(debrotli_state_s *s, const uint8_t *&context_map_slice)
+static __device__ int PrepareLiteralDecoding(debrotli_state_s* s, const uint8_t*& context_map_slice)
 {
   int context_mode;
   uint32_t block_type = s->block_type_rb[1];
@@ -1525,13 +1534,13 @@ static __device__ int PrepareLiteralDecoding(debrotli_state_s *s, const uint8_t
 }
 
 /// Decodes a command or literal and updates block type ring-buffer. Reads 3..54 bits.
-static __device__ uint32_t DecodeBlockTypeAndLength(debrotli_state_s *s, int tree_type)
+static __device__ uint32_t DecodeBlockTypeAndLength(debrotli_state_s* s, int tree_type)
 {
   uint32_t max_block_type = s->num_block_types[tree_type];
   if (max_block_type > 1) {
-    const uint16_t *len_tree = s->block_type_vlc[tree_type];
-    const uint16_t *type_tree = len_tree + brotli_huffman_max_size_26;
-    uint8_t *ringbuffer = &s->block_type_rb[tree_type * 2];
+    const uint16_t* len_tree = s->block_type_vlc[tree_type];
+    const uint16_t* type_tree = len_tree + brotli_huffman_max_size_26;
+    uint8_t* ringbuffer = &s->block_type_rb[tree_type * 2];
     // Read 0..15 + 3..39 bits.
     uint32_t block_type = getvlc(s, type_tree);
     uint32_t block_len = getvlc(s, len_tree);
@@ -1553,7 +1562,7 @@ static __device__ uint32_t DecodeBlockTypeAndLength(debrotli_state_s *s, int tre
   }
 }
 
-inline __device__ int ToUpperCase(uint8_t *p)
+inline __device__ int ToUpperCase(uint8_t* p)
 {
   if (p[0] < 0xC0) {
     if (p[0] >= 'a' && p[0] <= 'z') { p[0] ^= 32; }
@@ -1569,18 +1578,20 @@ inline __device__ int ToUpperCase(uint8_t *p)
   return 3;
 }
 
-static __device__ int TransformDictionaryWord(uint8_t *dst,
-                                              const uint8_t *word,
+static __device__ int TransformDictionaryWord(uint8_t* dst,
+                                              const uint8_t* word,
                                               int len,
                                               int transform_idx)
 {
   int idx = 0;
-  const uint8_t *prefix = brotli_transform_prefix(transform_idx);
+  const uint8_t* prefix = brotli_transform_prefix(transform_idx);
   uint8_t type = brotli_transform_type(transform_idx);
-  const uint8_t *suffix = brotli_transform_suffix(transform_idx);
+  const uint8_t* suffix = brotli_transform_suffix(transform_idx);
   {
     int prefix_len = *prefix++;
-    while (prefix_len--) { dst[idx++] = *prefix++; }
+    while (prefix_len--) {
+      dst[idx++] = *prefix++;
+    }
   }
   {
     const int t = type;
@@ -1592,11 +1603,13 @@ static __device__ int TransformDictionaryWord(uint8_t *dst,
       word += skip;
       len -= skip;
     }
-    while (i < len) { dst[idx++] = word[i++]; }
+    while (i < len) {
+      dst[idx++] = word[i++];
+    }
     if (t == BROTLI_TRANSFORM_UPPERCASE_FIRST) {
       ToUpperCase(&dst[idx - len]);
     } else if (t == BROTLI_TRANSFORM_UPPERCASE_ALL) {
-      uint8_t *uppercase = &dst[idx - len];
+      uint8_t* uppercase = &dst[idx - len];
       while (len > 0) {
         int step = ToUpperCase(uppercase);
         uppercase += step;
@@ -1606,24 +1619,26 @@ static __device__ int TransformDictionaryWord(uint8_t *dst,
   }
   {
     int suffix_len = *suffix++;
-    while (suffix_len--) { dst[idx++] = *suffix++; }
+    while (suffix_len--) {
+      dst[idx++] = *suffix++;
+    }
     return idx;
   }
 }
 
 /// ProcessCommands, actual decoding: 1 warp, most work done by thread0
-static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_dictionary_s *words, int t)
+static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_dictionary_s* words, int t)
 {
   int32_t meta_block_len = s->meta_block_len;
-  uint8_t *out = s->out;
+  uint8_t* out = s->out;
   int32_t pos = 0;
   int p1 = s->p1;
   int p2 = s->p2;
-  const uint16_t *htree_command;
+  const uint16_t* htree_command;
   const uint8_t *context_map_slice, *dist_context_map_slice;
   int dist_rb_idx;
   uint32_t blen_L, blen_I, blen_D;
-  uint8_t *const dict_scratch = reinterpret_cast<uint8_t *>(
+  uint8_t* const dict_scratch = reinterpret_cast<uint8_t*>(
     &s->hs);  // 24+13 bytes (max length of a dictionary word including prefix & suffix)
   int context_mode;
@@ -1678,7 +1693,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
       insert_length -= len;
       blen_L -= len;
       if (brotli_need_context_lut(context_mode)) {
-        const debrotli_huff_tree_group_s *literal_hgroup = s->literal_hgroup;
+        const debrotli_huff_tree_group_s* literal_hgroup = s->literal_hgroup;
         do {
           int context = brotli_context(p1, p2, context_mode);
           p2 = p1;
@@ -1686,7 +1701,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
           out[pos++] = p1;
         } while (--len);
       } else {
-        const uint16_t *literal_htree = s->literal_hgroup->htrees[context_map_slice[0]];
+        const uint16_t* literal_htree = s->literal_hgroup->htrees[context_map_slice[0]];
         do {
           p2 = p1;
          p1 = getvlc(s, literal_htree);
@@ -1704,7 +1719,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
         distance_code = s->dist_rb[dist_rb_idx & 3];
         distance_context = 1;
       } else {
-        const uint16_t *distance_tree;
+        const uint16_t* distance_tree;
         int distval;
         // Read distance code in the command, unless it was implicitly zero.
         if (blen_D == 0) {
@@ -1847,7 +1862,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
       if (distance_code > 0) {
         // Copy
         for (uint32_t i = t; i < copy_length; i += 32) {
-          const uint8_t *src =
+          const uint8_t* src =
             out + pos + ((i >= (uint32_t)distance_code) ? (i % (uint32_t)distance_code) : i) -
             distance_code;
           b = *src;
@@ -1855,7 +1870,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
         }
       } else {
         // Dictionary
-        const uint8_t *src = (distance_code < 0) ? &words->data[-distance_code] : dict_scratch;
+        const uint8_t* src = (distance_code < 0) ? &words->data[-distance_code] : dict_scratch;
         if (t < copy_length) {
           b = src[t];
           out[pos + t] = b;
@@ -1891,9 +1906,9 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
 * @param count Number of blocks to decompress
 */
 extern "C" __global__ void __launch_bounds__(block_size, 2)
-  gpu_debrotli_kernel(gpu_inflate_input_s *inputs,
-                      gpu_inflate_status_s *outputs,
-                      uint8_t *scratch,
+  gpu_debrotli_kernel(gpu_inflate_input_s* inputs,
+                      gpu_inflate_status_s* outputs,
+                      uint8_t* scratch,
                       uint32_t scratch_size,
                       uint32_t count)
 {
@@ -1901,16 +1916,16 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
   int t = threadIdx.x;
   int z = blockIdx.x;
 
-  debrotli_state_s *const s = &state_g;
+  debrotli_state_s* const s = &state_g;
   if (z >= count) { return; }
   // Thread0: initializes shared state and decode stream header
   if (!t) {
-    uint8_t const *src = static_cast<uint8_t const *>(inputs[z].srcDevice);
+    uint8_t const* src = static_cast<uint8_t const*>(inputs[z].srcDevice);
     size_t src_size = inputs[z].srcSize;
     if (src && src_size >= 8) {
       s->error = 0;
-      s->out = s->outbase = static_cast<uint8_t *>(inputs[z].dstDevice);
+      s->out = s->outbase = static_cast<uint8_t*>(inputs[z].dstDevice);
       s->bytes_left = inputs[z].dstSize;
       s->mtf_upper_bound = 63;
       s->dist_rb[0] = 16;
@@ -1940,8 +1955,8 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
     if (!s->error && s->meta_block_len != 0) {
       if (s->is_uncompressed) {
         // Uncompressed block
-        const uint8_t *src = s->cur + ((s->bitpos + 7) >> 3);
-        uint8_t *dst = s->out;
+        const uint8_t* src = s->cur + ((s->bitpos + 7) >> 3);
+        uint8_t* dst = s->out;
         if (!t) {
           if (getbits_bytealign(s) != 0) {
             s->error = -1;
@@ -1954,7 +1969,9 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
         __syncthreads();
         if (!s->error) {
           // Simple block-wide memcpy
-          for (int32_t i = t; i < s->meta_block_len; i += block_size) { dst[i] = src[i]; }
+          for (int32_t i = t; i < s->meta_block_len; i += block_size) {
+            dst[i] = src[i];
+          }
         }
       } else {
         // Compressed block
@@ -1971,8 +1988,7 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
       if (!s->error) {
         // Warp0: Decode compressed block, warps 1..7 are all idle (!)
         if (t < 32)
-          ProcessCommands(
-            s, reinterpret_cast<brotli_dictionary_s *>(scratch + scratch_size), t);
+          ProcessCommands(s, reinterpret_cast<brotli_dictionary_s*>(scratch + scratch_size), t);
         __syncthreads();
       }
       // Free any allocated memory
@@ -2053,16 +2069,16 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
 #include <stdio.h>
 #endif
 
-cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs,
-                                  gpu_inflate_status_s *outputs,
-                                  void *scratch,
+cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
+                                  gpu_inflate_status_s* outputs,
+                                  void* scratch,
                                   size_t scratch_size,
                                   int count,
                                   rmm::cuda_stream_view stream)
 {
   uint32_t count32 = (count > 0) ? count : 0;
   uint32_t fb_heap_size;
-  uint8_t *scratch_u8 = static_cast<uint8_t *>(scratch);
+  uint8_t* scratch_u8 = static_cast<uint8_t*>(scratch);
   dim3 dim_block(block_size, 1);
   dim3 dim_grid(count32, 1);  // TODO: Check max grid dimensions vs max expected count
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index eda1d37f78c..338af72e4c9 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -102,15 +102,15 @@ constexpr int prefetch_size = (1 << log2_prefetch_size);
 
 /// @brief Prefetcher state
 struct prefetch_queue_s {
-  const uint8_t *cur_p;  ///< Prefetch location
+  const uint8_t* cur_p;  ///< Prefetch location
   int run;               ///< prefetcher will exit when run=0
   uint8_t pref_data[prefetch_size];
 };
 
 template <typename T>
-inline __device__ volatile uint32_t *prefetch_addr32(volatile prefetch_queue_s &q, T *ptr)
+inline __device__ volatile uint32_t* prefetch_addr32(volatile prefetch_queue_s& q, T* ptr)
 {
-  return reinterpret_cast<volatile uint32_t *>(&q.pref_data[(prefetch_size - 4) & (size_t)(ptr)]);
+  return reinterpret_cast<volatile uint32_t*>(&q.pref_data[(prefetch_size - 4) & (size_t)(ptr)]);
 }
 
 #endif  // ENABLE_PREFETCH
@@ -120,12 +120,12 @@ inline __device__ volatile uint32_t *prefetch_addr32(volatile prefetch_queue_s &
 */
 struct inflate_state_s {
   // output state
-  uint8_t *out;      ///< output buffer
-  uint8_t *outbase;  ///< start of output buffer
-  uint8_t *outend;   ///< end of output buffer
+  uint8_t* out;      ///< output buffer
+  uint8_t* outbase;  ///< start of output buffer
+  uint8_t* outend;   ///< end of output buffer
   // Input state
-  uint8_t *cur;  ///< input buffer
-  uint8_t *end;  ///< end of input buffer
+  uint8_t* cur;  ///< input buffer
+  uint8_t* end;  ///< end of input buffer
 
   uint2 bitbuf;     ///< bit buffer (64-bit)
   uint32_t bitpos;  ///< position in bit buffer
@@ -165,24 +165,24 @@ inline __device__ unsigned int bfe(unsigned int source,
   return bits;
 };
 
-inline __device__ uint32_t showbits(inflate_state_s *s, uint32_t n)
+inline __device__ uint32_t showbits(inflate_state_s* s, uint32_t n)
 {
   uint32_t next32 = __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
   return (next32 & ((1 << n) - 1));
 }
 
-inline __device__ uint32_t nextbits32(inflate_state_s *s)
+inline __device__ uint32_t nextbits32(inflate_state_s* s)
 {
   return __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
 }
 
-inline __device__ void skipbits(inflate_state_s *s, uint32_t n)
+inline __device__ void skipbits(inflate_state_s* s, uint32_t n)
 {
   uint32_t bitpos = s->bitpos + n;
   if (bitpos >= 32) {
-    uint8_t *cur = s->cur + 8;
+    uint8_t* cur = s->cur + 8;
     s->bitbuf.x = s->bitbuf.y;
-    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<uint32_t *>(cur) : 0;
+    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<uint32_t*>(cur) : 0;
     s->cur = cur - 4;
     bitpos &= 0x1f;
   }
@@ -191,7 +191,7 @@ inline __device__ void skipbits(inflate_state_s *s, uint32_t n)
 
 // TODO: If we require 4-byte alignment of input bitstream & length (padded), reading bits would
 // become quite a bit faster
-__device__ uint32_t getbits(inflate_state_s *s, uint32_t n)
+__device__ uint32_t getbits(inflate_state_s* s, uint32_t n)
 {
   uint32_t v = showbits(s, n);
   skipbits(s, n);
@@ -222,7 +222,7 @@ __device__ uint32_t getbits(inflate_state_s *s, uint32_t n)
 * - Incomplete codes are handled by this decoder, since they are permitted
 *   in the deflate format.  See the format notes for fixed() and dynamic().
 */
-__device__ int decode(inflate_state_s *s, const int16_t *counts, const int16_t *symbols)
+__device__ int decode(inflate_state_s* s, const int16_t* counts, const int16_t* symbols)
 {
   unsigned int len;   // current number of bits in code
   unsigned int code;  // len bits being decoded
@@ -279,15 +279,16 @@ __device__ int decode(inflate_state_s *s, const int16_t *counts, const int16_t *
 *   the code bits definition.
 */
 __device__ int construct(
-  inflate_state_s *s, int16_t *counts, int16_t *symbols, const int16_t *length, int n)
+  inflate_state_s* s, int16_t* counts, int16_t* symbols, const int16_t* length, int n)
 {
   int symbol;  // current symbol when stepping through length[]
   int len;     // current length when stepping through counts[]
   int left;    // number of possible codes left of current length
-  int16_t *offs = s->u.scratch.offs;
+  int16_t* offs = s->u.scratch.offs;
 
   // count number of codes of each length
-  for (len = 0; len <= max_bits; len++) counts[len] = 0;
+  for (len = 0; len <= max_bits; len++)
+    counts[len] = 0;
   for (symbol = 0; symbol < n; symbol++)
     (counts[length[symbol]])++;  // assumes lengths are within bounds
   if (counts[0] == n)            // no codes!
@@ -303,7 +304,8 @@ __device__ int construct(
 
   // generate offsets into symbol table for each length for sorting
   offs[1] = 0;
-  for (len = 1; len < max_bits; len++) offs[len + 1] = offs[len] + counts[len];
+  for (len = 1; len < max_bits; len++)
+    offs[len + 1] = offs[len] + counts[len];
 
   // put symbols in table sorted by length, by symbol order within each length
   for (symbol = 0; symbol < n; symbol++)
@@ -318,12 +320,12 @@ __device__ int construct(
 static const __device__ __constant__ uint8_t g_code_order[19 + 1] = {
   16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15, 0xff};
 
 /// Dynamic block (custom huffman tables)
-__device__ int init_dynamic(inflate_state_s *s)
+__device__ int init_dynamic(inflate_state_s* s)
 {
   int nlen, ndist, ncode; /* number of lengths in descriptor */
   int index;              /* index of lengths[] */
   int err;                /* construct() return value */
-  int16_t *lengths = s->u.scratch.lengths;
+  int16_t* lengths = s->u.scratch.lengths;
 
   // get number of lengths in each table, check lengths
   nlen = getbits(s, 5) + 257;
@@ -333,8 +335,10 @@ __device__ int init_dynamic(inflate_state_s *s)
     return -3;  // bad counts
   }
   // read code length code lengths (really), missing lengths are zero
-  for (index = 0; index < ncode; index++) lengths[g_code_order[index]] = getbits(s, 3);
-  for (; index < 19; index++) lengths[g_code_order[index]] = 0;
+  for (index = 0; index < ncode; index++)
+    lengths[g_code_order[index]] = getbits(s, 3);
+  for (; index < 19; index++)
+    lengths[g_code_order[index]] = 0;
 
   // build huffman table for code lengths codes (use lencode temporarily)
   err = construct(s, s->lencnt, s->lensym, lengths, 19);
@@ -404,20 +408,25 @@ __device__ int init_dynamic(inflate_state_s *s)
 * length, this can be implemented as an incomplete code.  Then the invalid
 * codes are detected while decoding.
*/ -__device__ int init_fixed(inflate_state_s *s) +__device__ int init_fixed(inflate_state_s* s) { - int16_t *lengths = s->u.scratch.lengths; + int16_t* lengths = s->u.scratch.lengths; int symbol; // literal/length table - for (symbol = 0; symbol < 144; symbol++) lengths[symbol] = 8; - for (; symbol < 256; symbol++) lengths[symbol] = 9; - for (; symbol < 280; symbol++) lengths[symbol] = 7; - for (; symbol < fix_l_codes; symbol++) lengths[symbol] = 8; + for (symbol = 0; symbol < 144; symbol++) + lengths[symbol] = 8; + for (; symbol < 256; symbol++) + lengths[symbol] = 9; + for (; symbol < 280; symbol++) + lengths[symbol] = 7; + for (; symbol < fix_l_codes; symbol++) + lengths[symbol] = 8; construct(s, s->lencnt, s->lensym, lengths, fix_l_codes); // distance table - for (symbol = 0; symbol < max_d_codes; symbol++) lengths[symbol] = 5; + for (symbol = 0; symbol < max_d_codes; symbol++) + lengths[symbol] = 5; // build huffman table for distance codes construct(s, s->distcnt, s->distsym, lengths, max_d_codes); @@ -497,21 +506,21 @@ static const __device__ __constant__ uint16_t g_dext[30] = { // Extra bits for 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; /// @brief Thread 0 only: decode bitstreams and output symbols into the symbol queue -__device__ void decode_symbols(inflate_state_s *s) +__device__ void decode_symbols(inflate_state_s* s) { uint32_t bitpos = s->bitpos; uint2 bitbuf = s->bitbuf; - uint8_t *cur = s->cur; - uint8_t *end = s->end; + uint8_t* cur = s->cur; + uint8_t* end = s->end; int32_t batch = 0; int32_t sym, batch_len; do { - volatile uint32_t *b = &s->x.u.symqueue[batch * batch_size]; + volatile uint32_t* b = &s->x.u.symqueue[batch * batch_size]; // Wait for the next batch entry to be empty #if ENABLE_PREFETCH // Wait for prefetcher to fetch a worst-case of 48 bits per symbol - while ((*(volatile int32_t *)&s->pref.cur_p - (int32_t)(size_t)cur < batch_size * 6) || + while ((*(volatile int32_t*)&s->pref.cur_p - (int32_t)(size_t)cur < batch_size * 6) || (s->x.batch_len[batch] != 0)) {} #else while (s->x.batch_len[batch] != 0) {} @@ -544,7 +553,7 @@ __device__ void decode_symbols(inflate_state_s *s) } else { // Slow length path uint32_t next32r = __brev(next32); - const int16_t *symbols = &s->lensym[s->index_slow_len]; + const int16_t* symbols = &s->lensym[s->index_slow_len]; unsigned int first = s->first_slow_len; int lext; #pragma unroll 1 @@ -583,7 +592,7 @@ __device__ void decode_symbols(inflate_state_s *s) cur += 4; #else cur += 8; - bitbuf.y = (cur < end) ? *(const uint32_t *)cur : 0; + bitbuf.y = (cur < end) ? 
*(const uint32_t*)cur : 0; cur -= 4; #endif bitpos &= 0x1f; @@ -599,7 +608,7 @@ __device__ void decode_symbols(inflate_state_s *s) len += dext; } else { uint32_t next32r = __brev(next32); - const int16_t *symbols = &s->distsym[s->index_slow_dist]; + const int16_t* symbols = &s->distsym[s->index_slow_dist]; unsigned int first = s->first_slow_dist; #pragma unroll 1 for (len = log2_dist_lut + 1; len <= max_bits; len++) { @@ -636,7 +645,7 @@ __device__ void decode_symbols(inflate_state_s *s) #else cur += 8; if (cur < end) { - bitbuf.y = *(const uint32_t *)cur; + bitbuf.y = *(const uint32_t*)cur; cur -= 4; } else { bitbuf.y = 0; @@ -654,7 +663,7 @@ __device__ void decode_symbols(inflate_state_s *s) } while (batch_len < batch_size - 1); s->x.batch_len[batch] = batch_len; #if ENABLE_PREFETCH - ((volatile inflate_state_s *)s)->cur = cur; + ((volatile inflate_state_s*)s)->cur = cur; #endif if (batch_len != 0) batch = (batch + 1) & (batch_count - 1); } while (sym != 256); @@ -672,13 +681,13 @@ __device__ void decode_symbols(inflate_state_s *s) * @brief Build lookup tables for faster decode * LUT format is symbols*16+length */ -__device__ void init_length_lut(inflate_state_s *s, int t) +__device__ void init_length_lut(inflate_state_s* s, int t) { - int32_t *lut = s->u.lut.lenlut; + int32_t* lut = s->u.lut.lenlut; for (uint32_t bits = t; bits < (1 << log2_len_lut); bits += blockDim.x) { - const int16_t *cnt = s->lencnt; - const int16_t *symbols = s->lensym; + const int16_t* cnt = s->lencnt; + const int16_t* symbols = s->lensym; int sym = -10 << 5; unsigned int first = 0; unsigned int rbits = __brev(bits) >> (32 - log2_len_lut); @@ -704,7 +713,7 @@ __device__ void init_length_lut(inflate_state_s *s, int t) if (!t) { unsigned int first = 0; unsigned int index = 0; - const int16_t *cnt = s->lencnt; + const int16_t* cnt = s->lencnt; for (unsigned int len = 1; len <= log2_len_lut; len++) { unsigned int count = cnt[len]; index += count; @@ -720,13 +729,13 @@ __device__ void init_length_lut(inflate_state_s *s, int t) * @brief Build lookup tables for faster decode of distance symbol * LUT format is symbols*16+length */ -__device__ void init_distance_lut(inflate_state_s *s, int t) +__device__ void init_distance_lut(inflate_state_s* s, int t) { - int32_t *lut = s->u.lut.distlut; + int32_t* lut = s->u.lut.distlut; for (uint32_t bits = t; bits < (1 << log2_dist_lut); bits += blockDim.x) { - const int16_t *cnt = s->distcnt; - const int16_t *symbols = s->distsym; + const int16_t* cnt = s->distcnt; + const int16_t* symbols = s->distsym; int sym = 0; unsigned int first = 0; unsigned int rbits = __brev(bits) >> (32 - log2_dist_lut); @@ -749,7 +758,7 @@ __device__ void init_distance_lut(inflate_state_s *s, int t) if (!t) { unsigned int first = 0; unsigned int index = 0; - const int16_t *cnt = s->distcnt; + const int16_t* cnt = s->distcnt; for (unsigned int len = 1; len <= log2_dist_lut; len++) { unsigned int count = cnt[len]; index += count; @@ -762,15 +771,15 @@ __device__ void init_distance_lut(inflate_state_s *s, int t) } /// @brief WARP1: process symbols and output uncompressed stream -__device__ void process_symbols(inflate_state_s *s, int t) +__device__ void process_symbols(inflate_state_s* s, int t) { - uint8_t *out = s->out; - const uint8_t *outend = s->outend; - const uint8_t *outbase = s->outbase; + uint8_t* out = s->out; + const uint8_t* outend = s->outend; + const uint8_t* outbase = s->outbase; int batch = 0; do { - volatile uint32_t *b = &s->x.u.symqueue[batch * batch_size]; + volatile uint32_t* b = 
&s->x.u.symqueue[batch * batch_size]; int batch_len, pos; int32_t symt; uint32_t lit_mask; @@ -798,7 +807,7 @@ __device__ void process_symbols(inflate_state_s *s, int t) len = max((symbol & 0xffff) - 256, 0); // max should be unnecessary, but just in case dist = symbol >> 16; for (int i = t; i < len; i += 32) { - const uint8_t *src = out + ((i >= dist) ? (i % dist) : i) - dist; + const uint8_t* src = out + ((i >= dist) ? (i % dist) : i) - dist; uint8_t b = (src < outbase) ? 0 : *src; if (out + i < outend) { out[i] = b; } } @@ -838,7 +847,7 @@ __device__ void process_symbols(inflate_state_s *s, int t) * - A stored block can have zero length. This is sometimes used to byte-align * subsets of the compressed data for random access or partial recovery. */ -__device__ int init_stored(inflate_state_s *s) +__device__ int init_stored(inflate_state_s* s) { uint32_t len, nlen; // length of stored block @@ -863,13 +872,13 @@ __device__ int init_stored(inflate_state_s *s) } /// Copy bytes from stored block to destination -__device__ void copy_stored(inflate_state_s *s, int t) +__device__ void copy_stored(inflate_state_s* s, int t) { int len = s->stored_blk_len; - uint8_t *cur = s->cur + (s->bitpos >> 3); - uint8_t *out = s->out; - uint8_t *outend = s->outend; - uint8_t *cur4; + uint8_t* cur = s->cur + (s->bitpos >> 3); + uint8_t* out = s->out; + uint8_t* outend = s->outend; + uint8_t* cur4; int slow_bytes = min(len, (int)((16 - (size_t)out) & 0xf)); int fast_bytes, bitpos; @@ -893,18 +902,18 @@ __device__ void copy_stored(inflate_state_s *s, int t) // Fast copy 16 bytes at a time for (int i = t * 16; i < fast_bytes; i += blockDim.x * 16) { uint4 u; - u.x = *reinterpret_cast(cur4 + i + 0 * 4); - u.y = *reinterpret_cast(cur4 + i + 1 * 4); - u.z = *reinterpret_cast(cur4 + i + 2 * 4); - u.w = *reinterpret_cast(cur4 + i + 3 * 4); + u.x = *reinterpret_cast(cur4 + i + 0 * 4); + u.y = *reinterpret_cast(cur4 + i + 1 * 4); + u.z = *reinterpret_cast(cur4 + i + 2 * 4); + u.w = *reinterpret_cast(cur4 + i + 3 * 4); if (bitpos != 0) { - uint32_t v = (bitpos != 0) ? *reinterpret_cast(cur4 + i + 4 * 4) : 0; + uint32_t v = (bitpos != 0) ? *reinterpret_cast(cur4 + i + 4 * 4) : 0; u.x = __funnelshift_rc(u.x, u.y, bitpos); u.y = __funnelshift_rc(u.y, u.z, bitpos); u.z = __funnelshift_rc(u.z, u.w, bitpos); u.w = __funnelshift_rc(u.w, v, bitpos); } - *reinterpret_cast(out + i) = u; + *reinterpret_cast(out + i) = u; } } cur += fast_bytes; @@ -920,20 +929,20 @@ __device__ void copy_stored(inflate_state_s *s, int t) __syncthreads(); if (t == 0) { // Reset bitstream to end of block - uint8_t *p = cur + len; + uint8_t* p = cur + len; uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->cur = p; - s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; p += 4; - s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.y = (p < s->end) ? 
*reinterpret_cast(p) : 0; s->bitpos = prefix_bytes * 8; s->out = out; } } #if ENABLE_PREFETCH -__device__ void init_prefetcher(inflate_state_s *s, int t) +__device__ void init_prefetcher(inflate_state_s* s, int t) { if (t == 0) { s->pref.cur_p = s->cur; @@ -941,17 +950,17 @@ __device__ void init_prefetcher(inflate_state_s *s, int t) } } -__device__ void prefetch_warp(volatile inflate_state_s *s, int t) +__device__ void prefetch_warp(volatile inflate_state_s* s, int t) { - const uint8_t *cur_p = s->pref.cur_p; - const uint8_t *end = s->end; + const uint8_t* cur_p = s->pref.cur_p; + const uint8_t* end = s->end; while (shuffle((t == 0) ? s->pref.run : 0)) { int32_t cur_lo = (int32_t)(size_t)cur_p; int do_pref = - shuffle((t == 0) ? (cur_lo - *(volatile int32_t *)&s->cur < prefetch_size - 32 * 4 - 4) : 0); + shuffle((t == 0) ? (cur_lo - *(volatile int32_t*)&s->cur < prefetch_size - 32 * 4 - 4) : 0); if (do_pref) { - const uint8_t *p = cur_p + 4 * t; - *prefetch_addr32(s->pref, p) = (p < end) ? *reinterpret_cast(p) : 0; + const uint8_t* p = cur_p + 4 * t; + *prefetch_addr32(s->pref, p) = (p < end) ? *reinterpret_cast(p) : 0; cur_p += 4 * 32; __threadfence_block(); __syncwarp(); @@ -968,7 +977,7 @@ __device__ void prefetch_warp(volatile inflate_state_s *s, int t) * @brief Parse GZIP header * See https://tools.ietf.org/html/rfc1952 */ -__device__ int parse_gzip_header(const uint8_t *src, size_t src_size) +__device__ int parse_gzip_header(const uint8_t* src, size_t src_size) { int hdr_len = -1; @@ -1020,16 +1029,16 @@ __device__ int parse_gzip_header(const uint8_t *src, size_t src_size) */ template __global__ void __launch_bounds__(block_size) - inflate_kernel(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int parse_hdr) + inflate_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; int t = threadIdx.x; int z = blockIdx.x; - inflate_state_s *state = &state_g; + inflate_state_s* state = &state_g; if (!t) { - uint8_t *p = const_cast(static_cast(inputs[z].srcDevice)); + uint8_t* p = const_cast(static_cast(inputs[z].srcDevice)); size_t src_size = inputs[z].srcSize; uint32_t prefix_bytes; // Parse header if needed @@ -1045,16 +1054,16 @@ __global__ void __launch_bounds__(block_size) } } // Initialize shared state - state->out = const_cast(static_cast(inputs[z].dstDevice)); + state->out = const_cast(static_cast(inputs[z].dstDevice)); state->outbase = state->out; state->outend = state->out + inputs[z].dstSize; state->end = p + src_size; prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; state->cur = p; - state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; p += 4; - state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.y = (p < state->end) ? 
*reinterpret_cast(p) : 0; state->bitpos = prefix_bytes * 8; } __syncthreads(); @@ -1139,21 +1148,21 @@ __global__ void __launch_bounds__(block_size) * * @param inputs Source and destination information per block */ -__global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_input_s *inputs) +__global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_input_s* inputs) { - __shared__ const uint8_t *volatile src_g; - __shared__ uint8_t *volatile dst_g; + __shared__ const uint8_t* volatile src_g; + __shared__ uint8_t* volatile dst_g; __shared__ uint32_t volatile copy_len_g; uint32_t t = threadIdx.x; uint32_t z = blockIdx.x; - const uint8_t *src; - uint8_t *dst; + const uint8_t* src; + uint8_t* dst; uint32_t len, src_align_bytes, src_align_bits, dst_align_bytes; if (!t) { - src = static_cast(inputs[z].srcDevice); - dst = static_cast(inputs[z].dstDevice); + src = static_cast(inputs[z].srcDevice); + dst = static_cast(inputs[z].dstDevice); len = min((uint32_t)inputs[z].srcSize, (uint32_t)inputs[z].dstSize); src_g = src; dst_g = dst; @@ -1175,12 +1184,12 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes << 3; while (len >= 32) { - const uint32_t *src32 = reinterpret_cast(src - src_align_bytes); + const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); uint32_t copy_cnt = min(len >> 2, 1024); if (t < copy_cnt) { uint32_t v = src32[t]; if (src_align_bits != 0) { v = __funnelshift_r(v, src32[t + 1], src_align_bits); } - reinterpret_cast(dst)[t] = v; + reinterpret_cast(dst)[t] = v; } src += copy_cnt * 4; dst += copy_cnt * 4; @@ -1189,8 +1198,8 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp if (t < len) { dst[t] = src[t]; } } -cudaError_t __host__ gpuinflate(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t __host__ gpuinflate(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count, int parse_hdr, rmm::cuda_stream_view stream) @@ -1203,7 +1212,7 @@ cudaError_t __host__ gpuinflate(gpu_inflate_input_s *inputs, return cudaSuccess; } -cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, +cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, int count, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 7ca6dd13e9a..a37d282997e 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -26,9 +26,9 @@ namespace io { * @brief Input parameters for the decompression interface */ struct gpu_inflate_input_s { - const void *srcDevice; + const void* srcDevice; uint64_t srcSize; - void *dstDevice; + void* dstDevice; uint64_t dstSize; }; @@ -53,8 +53,8 @@ struct gpu_inflate_status_s { * @param[in] parse_hdr Whether or not to parse GZIP header, default false * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpuinflate(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t gpuinflate(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count = 1, int parse_hdr = 0, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -66,7 +66,7 @@ cudaError_t gpuinflate(gpu_inflate_input_s *inputs, * @param[in] count Number of input structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, +cudaError_t 
gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -81,8 +81,8 @@ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, * @param[in] count Number of input/output structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_unsnap(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t gpu_unsnap(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -108,9 +108,9 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * @param[in] count Number of input/output structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, - void *scratch, +cudaError_t gpu_debrotli(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, + void* scratch, size_t scratch_size, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -126,8 +126,8 @@ cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, * @param[in] count Number of input/output structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_snap(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t gpu_snap(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 999d02e3a50..a3d7bd048e8 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -31,11 +31,11 @@ constexpr int hash_bits = 12; * @brief snappy compressor state */ struct snap_state_s { - const uint8_t *src; ///< Ptr to uncompressed data + const uint8_t* src; ///< Ptr to uncompressed data uint32_t src_len; ///< Uncompressed data length - uint8_t *dst_base; ///< Base ptr to output compressed data - uint8_t *dst; ///< Current ptr to uncompressed data - uint8_t *end; ///< End of uncompressed data buffer + uint8_t* dst_base; ///< Base ptr to output compressed data + uint8_t* dst; ///< Current ptr to uncompressed data + uint8_t* end; ///< End of uncompressed data buffer volatile uint32_t literal_length; ///< Number of literal bytes volatile uint32_t copy_length; ///< Number of copy bytes volatile uint32_t copy_distance; ///< Distance for copy bytes @@ -53,10 +53,10 @@ static inline __device__ uint32_t snap_hash(uint32_t v) /** * @brief Fetches four consecutive bytes */ -static inline __device__ uint32_t fetch4(const uint8_t *src) +static inline __device__ uint32_t fetch4(const uint8_t* src) { uint32_t src_align = 3 & reinterpret_cast(src); - const uint32_t *src32 = reinterpret_cast(src - src_align); + const uint32_t* src32 = reinterpret_cast(src - src_align); uint32_t v = src32[0]; return (src_align) ? 
diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu
index 999d02e3a50..a3d7bd048e8 100644
--- a/cpp/src/io/comp/snap.cu
+++ b/cpp/src/io/comp/snap.cu
@@ -31,11 +31,11 @@ constexpr int hash_bits = 12;
 * @brief snappy compressor state
 */
 struct snap_state_s {
-  const uint8_t *src;                 ///< Ptr to uncompressed data
+  const uint8_t* src;                 ///< Ptr to uncompressed data
   uint32_t src_len;                   ///< Uncompressed data length
-  uint8_t *dst_base;                  ///< Base ptr to output compressed data
-  uint8_t *dst;                       ///< Current ptr to uncompressed data
-  uint8_t *end;                       ///< End of uncompressed data buffer
+  uint8_t* dst_base;                  ///< Base ptr to output compressed data
+  uint8_t* dst;                       ///< Current ptr to uncompressed data
+  uint8_t* end;                       ///< End of uncompressed data buffer
   volatile uint32_t literal_length;   ///< Number of literal bytes
   volatile uint32_t copy_length;      ///< Number of copy bytes
   volatile uint32_t copy_distance;    ///< Distance for copy bytes
@@ -53,10 +53,10 @@ static inline __device__ uint32_t snap_hash(uint32_t v)
 /**
 * @brief Fetches four consecutive bytes
 */
-static inline __device__ uint32_t fetch4(const uint8_t *src)
+static inline __device__ uint32_t fetch4(const uint8_t* src)
 {
   uint32_t src_align = 3 & reinterpret_cast<uintptr_t>(src);
-  const uint32_t *src32 = reinterpret_cast<const uint32_t *>(src - src_align);
+  const uint32_t* src32 = reinterpret_cast<const uint32_t*>(src - src_align);
   uint32_t v = src32[0];
   return (src_align) ? __funnelshift_r(v, src32[1], src_align * 8) : v;
 }
@@ -72,8 +72,8 @@ static inline __device__ uint32_t fetch4(const uint8_t *src)
 *
 * @return Updated pointer to compressed byte stream
 */
-static __device__ uint8_t *StoreLiterals(
-  uint8_t *dst, uint8_t *end, const uint8_t *src, uint32_t len_minus1, uint32_t t)
+static __device__ uint8_t* StoreLiterals(
+  uint8_t* dst, uint8_t* end, const uint8_t* src, uint32_t len_minus1, uint32_t t)
 {
   if (len_minus1 < 60) {
     if (!t && dst < end) dst[0] = (len_minus1 << 2);
@@ -125,8 +125,8 @@ static __device__ uint8_t *StoreLiterals(
 *
 * @return Updated pointer to compressed byte stream
 */
-static __device__ uint8_t *StoreCopy(uint8_t *dst,
-                                     uint8_t *end,
+static __device__ uint8_t* StoreCopy(uint8_t* dst,
+                                     uint8_t* end,
                                      uint32_t copy_len,
                                      uint32_t distance)
 {
@@ -178,8 +178,8 @@ static inline __device__ uint32_t HashMatchAny(uint32_t v, uint32_t t)
 *
 * @return Number of bytes before first match (literal length)
 */
-static __device__ uint32_t FindFourByteMatch(snap_state_s *s,
-                                             const uint8_t *src,
+static __device__ uint32_t FindFourByteMatch(snap_state_s* s,
+                                             const uint8_t* src,
                                              uint32_t pos0,
                                              uint32_t t)
 {
@@ -233,8 +233,8 @@ static __device__ uint32_t FindFourByteMatch(snap_state_s *s,
 }

 /// @brief Returns the number of matching bytes for two byte sequences up to 63 bytes
-static __device__ uint32_t Match60(const uint8_t *src1,
-                                   const uint8_t *src2,
+static __device__ uint32_t Match60(const uint8_t* src1,
+                                   const uint8_t* src2,
                                    uint32_t len,
                                    uint32_t t)
 {
@@ -258,21 +258,21 @@ static __device__ uint32_t Match60(const uint8_t *src1,
 * @param[in] count Number of blocks to compress
 */
 extern "C" __global__ void __launch_bounds__(128)
-  snap_kernel(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count)
+  snap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int count)
 {
   __shared__ __align__(16) snap_state_s state_g;

-  snap_state_s *const s = &state_g;
+  snap_state_s* const s = &state_g;
   uint32_t t = threadIdx.x;
   uint32_t pos;
-  const uint8_t *src;
+  const uint8_t* src;

   if (!t) {
-    const uint8_t *src = static_cast<const uint8_t *>(inputs[blockIdx.x].srcDevice);
+    const uint8_t* src = static_cast<const uint8_t*>(inputs[blockIdx.x].srcDevice);
     uint32_t src_len   = static_cast<uint32_t>(inputs[blockIdx.x].srcSize);
-    uint8_t *dst       = static_cast<uint8_t *>(inputs[blockIdx.x].dstDevice);
+    uint8_t* dst       = static_cast<uint8_t*>(inputs[blockIdx.x].dstDevice);
     uint32_t dst_len   = static_cast<uint32_t>(inputs[blockIdx.x].dstSize);
-    uint8_t *end       = dst + dst_len;
+    uint8_t* end       = dst + dst_len;
     s->src             = src;
     s->src_len         = src_len;
     s->dst_base        = dst;
@@ -289,7 +289,7 @@ extern "C" __global__ void __launch_bounds__(128)
     s->copy_distance = 0;
   }
   for (uint32_t i = t; i < sizeof(s->hash_map) / sizeof(uint32_t); i += 128) {
-    *reinterpret_cast<volatile uint32_t *>(&s->hash_map[i * 2]) = 0;
+    *reinterpret_cast<volatile uint32_t*>(&s->hash_map[i * 2]) = 0;
   }
   __syncthreads();
   src = s->src;
@@ -301,8 +301,8 @@ extern "C" __global__ void __launch_bounds__(128)
     __syncthreads();
     if (t < 32) {
       // WARP0: Encode literals and copies
-      uint8_t *dst = s->dst;
-      uint8_t *end = s->end;
+      uint8_t* dst = s->dst;
+      uint8_t* end = s->end;
       if (literal_len > 0) {
         dst = StoreLiterals(dst, end, src + pos, literal_len - 1, t);
         pos += literal_len;
@@ -341,8 +341,8 @@ extern "C" __global__ void __launch_bounds__(128)
   }
 }

-cudaError_t __host__ gpu_snap(gpu_inflate_input_s *inputs,
-                              gpu_inflate_status_s *outputs,
+cudaError_t __host__ gpu_snap(gpu_inflate_input_s* inputs,
+                              gpu_inflate_status_s* outputs,
                               int count,
                               rmm::cuda_stream_view stream)
 {
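Editor's note (not part of the diff): StoreLiterals and StoreCopy above emit standard Snappy element tags, in which the two low bits of the tag byte select the element type. A minimal sketch of the same encoding rules per the public Snappy format; the helper names are ours.

```cpp
#include <cstdint>
#include <cstring>

// Literal of length `len` (1..60): tag 0b00, (len - 1) in the upper 6 bits.
// This matches the `dst[0] = (len_minus1 << 2)` path in StoreLiterals.
uint8_t* emit_short_literal(uint8_t* dst, const uint8_t* src, uint32_t len)
{
  *dst++ = static_cast<uint8_t>((len - 1) << 2);
  std::memcpy(dst, src, len);
  return dst + len;
}

// Copy with a 2-byte offset: tag 0b10, (len - 1) in the upper 6 bits,
// then the 16-bit distance little-endian (len 1..64, distance < 65536).
uint8_t* emit_copy2(uint8_t* dst, uint32_t len, uint32_t distance)
{
  *dst++ = static_cast<uint8_t>(((len - 1) << 2) | 2);
  *dst++ = static_cast<uint8_t>(distance);
  *dst++ = static_cast<uint8_t>(distance >> 8);
  return dst;
}
```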
diff --git a/cpp/src/io/comp/unbz2.h b/cpp/src/io/comp/unbz2.h
index 8f3a6eace5a..5731db63757 100644
--- a/cpp/src/io/comp/unbz2.h
+++ b/cpp/src/io/comp/unbz2.h
@@ -82,25 +82,25 @@ namespace io {
 // If BZ_OUTBUFF_FULL is returned and block_start is non-NULL, dstlen will be updated to point to
 // the end of the last valid block, and block_start will contain the offset in bits of the beginning
 // of the block, so it can be passed in to resume decoding later on.
-#define BZ_OK 0
-#define BZ_RUN_OK 1
-#define BZ_FLUSH_OK 2
-#define BZ_FINISH_OK 3
-#define BZ_STREAM_END 4
-#define BZ_SEQUENCE_ERROR (-1)
-#define BZ_PARAM_ERROR (-2)
-#define BZ_MEM_ERROR (-3)
-#define BZ_DATA_ERROR (-4)
+#define BZ_OK               0
+#define BZ_RUN_OK           1
+#define BZ_FLUSH_OK         2
+#define BZ_FINISH_OK        3
+#define BZ_STREAM_END       4
+#define BZ_SEQUENCE_ERROR   (-1)
+#define BZ_PARAM_ERROR      (-2)
+#define BZ_MEM_ERROR        (-3)
+#define BZ_DATA_ERROR       (-4)
 #define BZ_DATA_ERROR_MAGIC (-5)
-#define BZ_IO_ERROR (-6)
-#define BZ_UNEXPECTED_EOF (-7)
-#define BZ_OUTBUFF_FULL (-8)
+#define BZ_IO_ERROR         (-6)
+#define BZ_UNEXPECTED_EOF   (-7)
+#define BZ_OUTBUFF_FULL     (-8)

-int32_t cpu_bz2_uncompress(const uint8_t *input,
+int32_t cpu_bz2_uncompress(const uint8_t* input,
                            size_t inlen,
-                           uint8_t *dst,
-                           size_t *dstlen,
-                           uint64_t *block_start = nullptr);
+                           uint8_t* dst,
+                           size_t* dstlen,
+                           uint64_t* block_start = nullptr);

 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 44581bbc184..2cb99d897fe 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -106,32 +106,32 @@ struct bz2_file_header_s {
 #pragma pack(pop)

 struct gz_archive_s {
-  const gz_file_header_s *fhdr;
+  const gz_file_header_s* fhdr;
   uint16_t hcrc16;  // header crc16 if present
   uint16_t xlen;
-  const uint8_t *fxtra;      // xlen bytes (optional)
-  const uint8_t *fname;      // zero-terminated original filename if present
-  const uint8_t *fcomment;   // zero-terminated comment if present
-  const uint8_t *comp_data;  // compressed data
+  const uint8_t* fxtra;      // xlen bytes (optional)
+  const uint8_t* fname;      // zero-terminated original filename if present
+  const uint8_t* fcomment;   // zero-terminated comment if present
+  const uint8_t* comp_data;  // compressed data
   size_t comp_len;           // Compressed data length
   uint32_t crc32;            // CRC32 of uncompressed data
   uint32_t isize;            // Input size modulo 2^32
 };

 struct zip_archive_s {
-  const zip_eocd_s *eocd;    // end of central directory
-  const zip64_eocdl *eocdl;  // end of central dir locator (optional)
-  const zip_cdfh_s *cdfh;    // start of central directory file headers
+  const zip_eocd_s* eocd;    // end of central directory
+  const zip64_eocdl* eocdl;  // end of central dir locator (optional)
+  const zip_cdfh_s* cdfh;    // start of central directory file headers
 };

-bool ParseGZArchive(gz_archive_s *dst, const uint8_t *raw, size_t len)
+bool ParseGZArchive(gz_archive_s* dst, const uint8_t* raw, size_t len)
 {
-  const gz_file_header_s *fhdr;
+  const gz_file_header_s* fhdr;

   if (!dst) return false;
   memset(dst, 0, sizeof(gz_archive_s));
   if (len < sizeof(gz_file_header_s) + 8) return false;
-  fhdr = reinterpret_cast<const gz_file_header_s *>(raw);
+  fhdr = reinterpret_cast<const gz_file_header_s*>(raw);
   if (fhdr->id1 != 0x1f || fhdr->id2 != 0x8b) return false;
   dst->fhdr = fhdr;
   raw += sizeof(gz_file_header_s);
@@ -188,7 +188,7 @@ bool ParseGZArchive(gz_archive_s *dst, const uint8_t *raw, size_t len)
   return (fhdr->comp_mthd == 8 && len > 0);
 }

-bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
+bool OpenZipArchive(zip_archive_s* dst, const uint8_t* raw, size_t len)
 {
   memset(dst, 0, sizeof(zip_archive_s));
   // Find the end of central directory
@@ -196,17 +196,17 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
   for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2;
        i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0;
        i--) {
-    const zip_eocd_s *eocd = reinterpret_cast<const zip_eocd_s *>(raw + i);
+    const zip_eocd_s* eocd = reinterpret_cast<const zip_eocd_s*>(raw + i);
     if (eocd->sig == 0x06054b50 &&
         eocd->disk_id == eocd->start_disk  // multi-file archives not supported
         && eocd->num_entries == eocd->total_entries &&
         eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len &&
-        i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= static_cast<ptrdiff_t>(len)) {
-      const zip_cdfh_s *cdfh = reinterpret_cast<const zip_cdfh_s *>(raw + eocd->cdir_offset);
+        i + *reinterpret_cast<const uint16_t*>(eocd + 1) <= static_cast<ptrdiff_t>(len)) {
+      const zip_cdfh_s* cdfh = reinterpret_cast<const zip_cdfh_s*>(raw + eocd->cdir_offset);
       dst->eocd = eocd;
       if (i >= static_cast<ptrdiff_t>(sizeof(zip64_eocdl))) {
-        const zip64_eocdl *eocdl =
-          reinterpret_cast<const zip64_eocdl *>(raw + i - sizeof(zip64_eocdl));
+        const zip64_eocdl* eocdl =
+          reinterpret_cast<const zip64_eocdl*>(raw + i - sizeof(zip64_eocdl));
         if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; }
       }
       // Start of central directory
@@ -217,13 +217,13 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
   return (dst->eocd && dst->cdfh);
 }

-int cpu_inflate(uint8_t *uncomp_data, size_t *destLen, const uint8_t *comp_data, size_t comp_len)
+int cpu_inflate(uint8_t* uncomp_data, size_t* destLen, const uint8_t* comp_data, size_t comp_len)
 {
   int zerr;
   z_stream strm;

   memset(&strm, 0, sizeof(strm));
-  strm.next_in   = const_cast<Bytef *>(reinterpret_cast<const Bytef *>(comp_data));
+  strm.next_in   = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(comp_data));
   strm.avail_in  = comp_len;
   strm.total_in  = 0;
   strm.next_out  = uncomp_data;
@@ -250,16 +250,16 @@ int cpu_inflate(uint8_t *uncomp_data, size_t *destLen, const uint8_t *comp_data,
 * @param comp_data[in] Raw compressed data
 * @param comp_len[in] Compressed data size
 */
-int cpu_inflate_vector(std::vector<char> &dst, const uint8_t *comp_data, size_t comp_len)
+int cpu_inflate_vector(std::vector<char>& dst, const uint8_t* comp_data, size_t comp_len)
 {
   int zerr;
   z_stream strm;

   memset(&strm, 0, sizeof(strm));
-  strm.next_in   = const_cast<Bytef *>(reinterpret_cast<const Bytef *>(comp_data));
+  strm.next_in   = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(comp_data));
   strm.avail_in  = comp_len;
   strm.total_in  = 0;
-  strm.next_out  = reinterpret_cast<Bytef *>(dst.data());
+  strm.next_out  = reinterpret_cast<Bytef*>(dst.data());
   strm.avail_out = dst.size();
   strm.total_out = 0;
   zerr           = inflateInit2(&strm, -15);  // -15 for raw data without GZIP headers
@@ -271,7 +271,7 @@ int cpu_inflate_vector(std::vector<char> &dst, const uint8_t *comp_data, size_t
     if (strm.avail_out == 0) {
       dst.resize(strm.total_out + (1 << 30));
       strm.avail_out = dst.size() - strm.total_out;
-      strm.next_out  = reinterpret_cast<Bytef *>(dst.data()) + strm.total_out;
+      strm.next_out  = reinterpret_cast<Bytef*>(dst.data()) + strm.total_out;
     }
     zerr = inflate(&strm, Z_SYNC_FLUSH);
   } while ((zerr == Z_BUF_ERROR || zerr == Z_OK) && strm.avail_out == 0 &&
@@ -293,10 +293,10 @@ int cpu_inflate_vector(std::vector<char> &dst, const uint8_t *comp_data, size_t
 *
 * @return Vector containing the uncompressed output
 */
-std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int stream_type)
+std::vector<char> io_uncompress_single_h2d(const void* src, size_t src_size, int stream_type)
 {
-  const uint8_t *raw       = static_cast<const uint8_t *>(src);
-  const uint8_t *comp_data = nullptr;
+  const uint8_t* raw       = static_cast<const uint8_t*>(src);
+  const uint8_t* comp_data = nullptr;
   size_t comp_len          = 0;
   size_t uncomp_len        = 0;
@@ -320,8 +320,8 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
       if (OpenZipArchive(&za, raw, src_size)) {
         size_t cdfh_ofs = 0;
         for (int i = 0; i < za.eocd->num_entries; i++) {
-          const zip_cdfh_s *cdfh = reinterpret_cast<const zip_cdfh_s *>(
-            reinterpret_cast<const uint8_t *>(za.cdfh) + cdfh_ofs);
+          const zip_cdfh_s* cdfh = reinterpret_cast<const zip_cdfh_s*>(
+            reinterpret_cast<const uint8_t*>(za.cdfh) + cdfh_ofs);
           int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len;
           if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x02014b50) {
             // Bad cdir
@@ -330,7 +330,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
           // For now, only accept with non-zero file sizes and DEFLATE
           if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) {
             size_t lfh_ofs = cdfh->hdr_ofs;
-            const zip_lfh_s *lfh = reinterpret_cast<const zip_lfh_s *>(raw + lfh_ofs);
+            const zip_lfh_s* lfh = reinterpret_cast<const zip_lfh_s*>(raw + lfh_ofs);
             if (lfh_ofs + sizeof(zip_lfh_s) <= src_size && lfh->sig == 0x04034b50 &&
                 lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src_size) {
               if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) {
@@ -354,7 +354,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
       if (stream_type != IO_UNCOMP_STREAM_TYPE_INFER) break;  // Fall through for INFER
     case IO_UNCOMP_STREAM_TYPE_BZIP2:
       if (src_size > 4) {
-        const bz2_file_header_s *fhdr = reinterpret_cast<const bz2_file_header_s *>(raw);
+        const bz2_file_header_s* fhdr = reinterpret_cast<const bz2_file_header_s*>(raw);
         // Check for BZIP2 file signature "BZh1" to "BZh9"
         if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' &&
             fhdr->blksz >= '1' && fhdr->blksz <= '9') {
@@ -392,7 +392,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
     do {
       size_t dst_len = uncomp_len - dst_ofs;
       bz_err         = cpu_bz2_uncompress(
-        comp_data, comp_len, reinterpret_cast<uint8_t *>(dst.data()) + dst_ofs, &dst_len, &src_ofs);
+        comp_data, comp_len, reinterpret_cast<uint8_t*>(dst.data()) + dst_ofs, &dst_len, &src_ofs);
       if (bz_err == BZ_OUTBUFF_FULL) {
         // TBD: We could infer the compression ratio based on produced/consumed byte counts
         // in order to minimize realloc events and over-allocation
@@ -422,7 +422,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
 * @return Vector containing the output uncompressed data
 */
 std::vector<char> get_uncompressed_data(host_span<char const> const data,
-                                        std::string const &compression)
+                                        std::string const& compression)
 {
   int comp_type = IO_UNCOMP_STREAM_TYPE_INFER;
   if (compression == "gzip")
@@ -443,9 +443,9 @@ std::vector<char> get_uncompressed_data(host_span<char const> const data,
 class HostDecompressor_ZLIB : public HostDecompressor {
  public:
  HostDecompressor_ZLIB(bool gz_hdr_) : gz_hdr(gz_hdr_) {}
-  size_t Decompress(uint8_t *dstBytes,
+  size_t Decompress(uint8_t* dstBytes,
                     size_t dstLen,
-                    const uint8_t *srcBytes,
+                    const uint8_t* srcBytes,
                     size_t srcLen) override
   {
     if (gz_hdr) {
@@ -471,14 +471,14 @@ class HostDecompressor_ZLIB : public HostDecompressor {
 class HostDecompressor_SNAPPY : public HostDecompressor {
  public:
  HostDecompressor_SNAPPY() {}
-  size_t Decompress(uint8_t *dstBytes,
+  size_t Decompress(uint8_t* dstBytes,
                     size_t dstLen,
-                    const uint8_t *srcBytes,
+                    const uint8_t* srcBytes,
                     size_t srcLen) override
   {
     uint32_t uncompressed_size, bytes_left, dst_pos;
-    const uint8_t *cur = srcBytes;
-    const uint8_t *end = srcBytes + srcLen;
+    const uint8_t* cur = srcBytes;
+    const uint8_t* end = srcBytes + srcLen;

     if (!dstBytes || srcLen < 1) { return 0; }
     // Read uncompressed length (varint)
@@ -510,12 +510,12 @@ class HostDecompressor_SNAPPY : public HostDecompressor {
         if (blen & 2) {
           // xxxxxx1x: copy with 6-bit length, 2-byte or 4-byte offset
           if (cur + 2 > end) break;
-          offset = *reinterpret_cast<const uint16_t *>(cur);
+          offset = *reinterpret_cast<const uint16_t*>(cur);
           cur += 2;
           if (blen & 1)  // 4-byte offset
           {
             if (cur + 2 > end) break;
-            offset |= (*reinterpret_cast<const uint16_t *>(cur)) << 16;
+            offset |= (*reinterpret_cast<const uint16_t*>(cur)) << 16;
             cur += 2;
           }
           blen = (blen >> 2) + 1;
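Editor's note (not part of the diff): the "Read uncompressed length (varint)" step above parses a little-endian base-128 varint, which is how a Snappy stream encodes its uncompressed size. A sketch under that assumption; the helper name is ours.

```cpp
#include <cstdint>

// Returns the number of bytes consumed, or 0 on malformed input.
int read_uncompressed_length(const uint8_t* cur, const uint8_t* end, uint32_t* out)
{
  uint32_t value = 0;
  int shift      = 0;
  for (const uint8_t* p = cur; p < end && shift < 35; ++p, shift += 7) {
    value |= static_cast<uint32_t>(*p & 0x7f) << shift;
    if ((*p & 0x80) == 0) {  // high bit clear: last varint byte
      *out = value;
      return static_cast<int>(p - cur) + 1;
    }
  }
  return 0;  // ran off the end, or varint longer than 5 bytes
}
```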
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index f9d491b3cc8..5fe01735dac 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -64,8 +64,8 @@ struct unsnap_queue_s {
 * @brief snappy decompression state
 */
 struct unsnap_state_s {
-  const uint8_t *base;         ///< base ptr of compressed stream
-  const uint8_t *end;          ///< end of compressed stream
+  const uint8_t* base;         ///< base ptr of compressed stream
+  const uint8_t* end;          ///< end of compressed stream
   uint32_t uncompressed_size;  ///< uncompressed stream size
   uint32_t bytes_left;         ///< bytes to uncompressed remaining
   int32_t error;               ///< current error status
@@ -74,7 +74,7 @@ struct unsnap_state_s {
   gpu_inflate_input_s in;      ///< input parameters for current block
 };

-inline __device__ volatile uint8_t &byte_access(unsnap_state_s *s, uint32_t pos)
+inline __device__ volatile uint8_t& byte_access(unsnap_state_s* s, uint32_t pos)
 {
   return s->q.buf[pos & (prefetch_size - 1)];
 }
@@ -85,9 +85,9 @@ inline __device__ volatile uint8_t &byte_access(unsnap_state_s *s, uint32_t pos)
 * @param s decompression state
 * @param t warp lane id
 */
-__device__ void snappy_prefetch_bytestream(unsnap_state_s *s, int t)
+__device__ void snappy_prefetch_bytestream(unsnap_state_s* s, int t)
 {
-  const uint8_t *base  = s->base;
+  const uint8_t* base  = s->base;
   uint32_t end         = (uint32_t)(s->end - base);
   uint32_t align_bytes = (uint32_t)(0x20 - (0x1f & reinterpret_cast<uintptr_t>(base)));
   int32_t pos          = min(align_bytes, end);
@@ -275,7 +275,7 @@ inline __device__ uint32_t get_len5_mask(uint32_t v0, uint32_t v1)
 * @param s decompression state
 * @param t warp lane id
 */
-__device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
+__device__ void snappy_decode_symbols(unsnap_state_s* s, uint32_t t)
 {
   uint32_t cur = 0;
   uint32_t end = static_cast<uint32_t>(s->end - s->base);
@@ -285,13 +285,15 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
   for (;;) {
     int32_t batch_len;
-    volatile unsnap_batch_s *b;
+    volatile unsnap_batch_s* b;

     // Wait for prefetcher
     if (t == 0) {
       s->q.prefetch_rdpos = cur;
 #pragma unroll(1)  // We don't want unrolling here
-      while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { busy_wait(10); }
+      while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {
+        busy_wait(10);
+      }
       b = &s->q.batch[batch * batch_size];
     }
     // Process small symbols in parallel: for data that does not get good compression,
@@ -315,17 +317,17 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
       is_long_sym    = ((b0 & ~4) != 0) && (((b0 + 1) & 2) == 0);
       short_sym_mask = ballot(is_long_sym);
       batch_len      = 0;
-      b = reinterpret_cast<volatile unsnap_batch_s *>(shuffle(reinterpret_cast<uintptr_t>(b)));
+      b = reinterpret_cast<volatile unsnap_batch_s*>(shuffle(reinterpret_cast<uintptr_t>(b)));
       if (!(short_sym_mask & 1)) {
         batch_len = shuffle((t == 0) ? (short_sym_mask) ? __ffs(short_sym_mask) - 1 : 32 : 0);
         if (batch_len != 0) {
           uint32_t blen = 0;
           int32_t ofs   = 0;
           if (t < batch_len) {
-            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
-            ofs  = (b0 & 1) ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
-                           : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
-                                      : -(int32_t)(cur_t + 1);
+            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
+            ofs  = (b0 & 1)   ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
+                   : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
+                              : -(int32_t)(cur_t + 1);
             b[t].len    = blen;
             b[t].offset = ofs;
             ofs += blen;  // for correct out-of-range detection below
@@ -368,11 +370,10 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
           uint32_t blen = 0;
           int32_t ofs   = 0;
           if (t < batch_add) {
-            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
-            ofs  = (b0 & 1)
-                    ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
-                    : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
-                               : -(int32_t)(cur_t + 1);
+            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
+            ofs  = (b0 & 1)   ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
+                   : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
+                              : -(int32_t)(cur_t + 1);
             b[batch_len + t].len    = blen;
             b[batch_len + t].offset = ofs;
             ofs += blen;  // for correct out-of-range detection below
@@ -451,7 +452,9 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
         // Wait for prefetcher
         s->q.prefetch_rdpos = cur;
 #pragma unroll(1)  // We don't want unrolling here
-        while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { busy_wait(10); }
+        while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {
+          busy_wait(10);
+        }
         dst_pos += blen;
         if (bytes_left < blen) break;
         bytes_left -= blen;
@@ -467,7 +470,9 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
     }
     batch_len = shuffle(batch_len);
     if (t == 0) {
-      while (s->q.batch_len[batch] != 0) { busy_wait(20); }
+      while (s->q.batch_len[batch] != 0) {
+        busy_wait(20);
+      }
     }
     if (batch_len != batch_size) { break; }
   }
@@ -489,18 +494,20 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
 *would result in out-of-bounds accesses)
 */
 template <typename Storage>
-__device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_storage)
+__device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_storage)
 {
-  const uint8_t *literal_base = s->base;
-  uint8_t *out                = static_cast<uint8_t *>(s->in.dstDevice);
+  const uint8_t* literal_base = s->base;
+  uint8_t* out                = static_cast<uint8_t*>(s->in.dstDevice);
   int batch                   = 0;

   do {
-    volatile unsnap_batch_s *b = &s->q.batch[batch * batch_size];
+    volatile unsnap_batch_s* b = &s->q.batch[batch * batch_size];
     int32_t batch_len, blen_t, dist_t;

     if (t == 0) {
-      while ((batch_len = s->q.batch_len[batch]) == 0) { busy_wait(20); }
+      while ((batch_len = s->q.batch_len[batch]) == 0) {
+        busy_wait(20);
+      }
     } else {
       batch_len = 0;
     }
@@ -529,7 +536,7 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
         uint32_t tr  = t - shuffle(bofs - blen_t, it);
         int32_t dist = shuffle(dist_t, it);
         if (it < n) {
-          const uint8_t *src = (dist > 0) ? (out + t - dist) : (literal_base + tr - dist);
+          const uint8_t* src = (dist > 0) ? (out + t - dist) : (literal_base + tr - dist);
           out[t] = *src;
         }
         out += shuffle(bofs, n - 1);
@@ -556,7 +563,7 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
         }
         blen += blen2;
         if (t < blen) {
-          const uint8_t *src = (dist > 0) ? (out - d) : (literal_base - d);
+          const uint8_t* src = (dist > 0) ? (out - d) : (literal_base - d);
           out[t] = src[t];
         }
         out += blen;
@@ -569,12 +576,12 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
       uint8_t b0, b1;
       if (t < blen) {
         uint32_t pos = t;
-        const uint8_t *src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
+        const uint8_t* src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
         b0 = *src;
       }
       if (32 + t < blen) {
         uint32_t pos = 32 + t;
-        const uint8_t *src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
+        const uint8_t* src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
         b1 = *src;
       }
       if (t < blen) { out[t] = b0; }
@@ -616,24 +623,23 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
 */
 template <int block_size>
 __global__ void __launch_bounds__(block_size)
-  unsnap_kernel(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs)
+  unsnap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs)
 {
   __shared__ __align__(16) unsnap_state_s state_g;
   __shared__ cub::WarpReduce<uint32_t>::TempStorage temp_storage;
   int t             = threadIdx.x;
-  unsnap_state_s *s = &state_g;
+  unsnap_state_s* s = &state_g;
   int strm_id       = blockIdx.x;

   if (t < sizeof(gpu_inflate_input_s) / sizeof(uint32_t)) {
-    reinterpret_cast<uint32_t *>(&s->in)[t] =
-      reinterpret_cast<const uint32_t *>(&inputs[strm_id])[t];
+    reinterpret_cast<uint32_t*>(&s->in)[t] = reinterpret_cast<const uint32_t*>(&inputs[strm_id])[t];
     __threadfence_block();
   }
   if (t < batch_count) { s->q.batch_len[t] = 0; }
   __syncthreads();
   if (!t) {
-    const uint8_t *cur = static_cast<const uint8_t *>(s->in.srcDevice);
-    const uint8_t *end = cur + s->in.srcSize;
+    const uint8_t* cur = static_cast<const uint8_t*>(s->in.srcDevice);
+    const uint8_t* end = cur + s->in.srcSize;
     s->error           = 0;
     if (log_cyclecount) { s->tstart = clock(); }
     if (cur < end) {
@@ -700,8 +706,8 @@ __global__ void __launch_bounds__(block_size)
   }
 }

-cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s *inputs,
-                                gpu_inflate_status_s *outputs,
+cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s* inputs,
+                                gpu_inflate_status_s* outputs,
                                 int count,
                                 rmm::cuda_stream_view stream)
 {
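Editor's note (not part of the diff): unsnap_kernel coordinates a prefetch warp, a symbol-decoding warp, and an output warp through volatile shared-memory flags such as `s->q.batch_len`, spinning with `busy_wait` until the other side makes progress. Reduced to its shape, the handshake looks roughly like the CUDA sketch below; the function and parameter names are ours.

```cuda
// Producer/consumer handoff through a shared, volatile slot counter.
__device__ void publish_batch(volatile int* batch_len, int n)
{
  while (*batch_len != 0) {}  // wait until the consumer drained the slot
  __threadfence_block();      // make batch contents visible before the flag
  *batch_len = n;             // hand the batch to the consumer
}

__device__ int consume_batch(volatile int* batch_len)
{
  int n;
  while ((n = *batch_len) == 0) {}  // spin until a batch is published
  __threadfence_block();
  *batch_len = 0;                   // return the slot to the producer
  return n;
}
```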
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index a3da5383196..68ac67b900d 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -269,7 +269,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
       auto const is_negative = (*trimmed_field_range.first == '-');
       auto const data_begin =
         trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+'));
-      cudf::size_type *ptr = cudf::io::gpu::infer_integral_field_counter(
+      cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter(
         data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]);
       atomicAdd(ptr, 1);
     } else if (is_floatingpoint(trimmed_field_len,
@@ -292,33 +292,33 @@ __global__ void __launch_bounds__(csvparse_block_dim)
 }

 template <typename T, int base>
-__inline__ __device__ T decode_value(char const *begin,
-                                     char const *end,
-                                     parse_options_view const &opts)
+__inline__ __device__ T decode_value(char const* begin,
+                                     char const* end,
+                                     parse_options_view const& opts)
 {
   return cudf::io::parse_numeric<T, base>(begin, end, opts);
 }

 template <typename T>
-__inline__ __device__ T decode_value(char const *begin,
-                                     char const *end,
-                                     parse_options_view const &opts)
+__inline__ __device__ T decode_value(char const* begin,
+                                     char const* end,
+                                     parse_options_view const& opts)
 {
   return cudf::io::parse_numeric<T>(begin, end, opts);
 }

 template <>
-__inline__ __device__ cudf::timestamp_D decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_D decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   return timestamp_D{cudf::duration_D{to_date(begin, end, opts.dayfirst)}};
 }

 template <>
-__inline__ __device__ cudf::timestamp_s decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_s decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -329,9 +329,9 @@ __inline__ __device__ cudf::timestamp_s decode_value(char const *begin,
 }

 template <>
-__inline__ __device__ cudf::timestamp_ms decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_ms decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -342,9 +342,9 @@ __inline__ __device__ cudf::timestamp_ms decode_value(char const *begin,
 }

 template <>
-__inline__ __device__ cudf::timestamp_us decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_us decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -355,9 +355,9 @@ __inline__ __device__ cudf::timestamp_us decode_value(char const *begin,
 }

 template <>
-__inline__ __device__ cudf::timestamp_ns decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_ns decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -371,7 +371,7 @@ __inline__ __device__ cudf::timestamp_ns decode_value(char const *begin,
 #define DURATION_DECODE_VALUE(Type)                                 \
   template <>                                                       \
   __inline__ __device__ Type decode_value(                          \
-    const char *begin, const char *end, parse_options_view const &opts) \
+    const char* begin, const char* end, parse_options_view const& opts) \
   {                                                                 \
     return Type{to_time_delta<Type>(begin, end)};                   \
   }
@@ -385,18 +385,18 @@ DURATION_DECODE_VALUE(duration_ns)
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
-__inline__ __device__ cudf::string_view decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::string_view decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   return cudf::string_view{};
 }

 // The purpose of this is merely to allow compilation ONLY
 template <>
-__inline__ __device__ cudf::dictionary32 decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::dictionary32 decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   return cudf::dictionary32{};
 }
@@ -404,9 +404,9 @@ __inline__ __device__ cudf::dictionary32 decode_value(char const *begin,
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
-__inline__ __device__ cudf::list_view decode_value(char const *begin,
-                                                   char const *end,
-                                                   parse_options_view const &opts)
+__inline__ __device__ cudf::list_view decode_value(char const* begin,
+                                                   char const* end,
+                                                   parse_options_view const& opts)
 {
   return cudf::list_view{};
 }
@@ -414,9 +414,9 @@ __inline__ __device__ cudf::list_view decode_value(char const *begin,
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
-__inline__ __device__ cudf::struct_view decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::struct_view decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   return cudf::struct_view{};
 }
@@ -434,16 +434,16 @@ struct decode_op {
   */
   template <typename T,
             typename std::enable_if_t<std::is_integral_v<T> and !std::is_same_v<T, bool> and
-                                      !cudf::is_fixed_point<T>()> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+                                      !cudf::is_fixed_point<T>()>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = [&flags, &opts, begin, end]() -> T {
+    static_cast<T*>(out_buffer)[row] = [&flags, &opts, begin, end]() -> T {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
@@ -460,16 +460,16 @@ struct decode_op {
   *
   * @return bool Whether the parsed value is valid.
   */
-  template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+  template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type output_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<device_storage_type_t<T> *>(out_buffer)[row] =
+    static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
       [&flags, &opts, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
         begin, end, output_type.scale());
@@ -481,16 +481,16 @@ struct decode_op {
  /**
   * @brief Dispatch for boolean type types.
   */
-  template <typename T, typename std::enable_if_t<std::is_same_v<T, bool>> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+  template <typename T, typename std::enable_if_t<std::is_same_v<T, bool>>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = [&opts, begin, end]() {
+    static_cast<T*>(out_buffer)[row] = [&opts, begin, end]() {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
@@ -505,17 +505,17 @@ struct decode_op {
   * @brief Dispatch for floating points, which are set to NaN if the input
   * is not valid. In such case, the validity mask is set to zero too.
   */
-  template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+  template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    T const value                    = decode_value<T>(begin, end, opts);
-    static_cast<T *>(out_buffer)[row] = value;
+    T const value                    = decode_value<T>(begin, end, opts);
+    static_cast<T*>(out_buffer)[row] = value;

     return !std::isnan(value);
   }
@@ -525,16 +525,16 @@ struct decode_op {
   */
   template <typename T,
             typename std::enable_if_t<!std::is_integral_v<T> and !std::is_floating_point_v<T> and
-                                      !cudf::is_fixed_point<T>()> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+                                      !cudf::is_fixed_point<T>()>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = decode_value<T>(begin, end, opts);
+    static_cast<T*>(out_buffer)[row] = decode_value<T>(begin, end, opts);

     return true;
   }
@@ -559,8 +559,8 @@ __global__ void __launch_bounds__(csvparse_block_dim)
   device_span<column_parse::flags const> column_flags,
   device_span<uint64_t const> row_offsets,
   device_span<data_type const> dtypes,
-  device_span<void *const> columns,
-  device_span<cudf::bitmask_type *const> valids)
+  device_span<void* const> columns,
+  device_span<cudf::bitmask_type* const> valids)
 {
   auto const raw_csv = data.data();
   // thread IDs range per block, so also need the block id.
@@ -605,7 +605,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
             --end;
           }
         }
-        auto str_list = static_cast<std::pair<const char *, size_t> *>(columns[actual_col]);
+        auto str_list = static_cast<std::pair<const char*, size_t>*>(columns[actual_col]);
         str_list[rec_id].first  = field_start;
         str_list[rec_id].second = end - field_start;
       } else {
@@ -623,7 +623,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
           }
         }
       } else if (dtypes[actual_col].id() == cudf::type_id::STRING) {
-        auto str_list = static_cast<std::pair<const char *, size_t> *>(columns[actual_col]);
+        auto str_list = static_cast<std::pair<const char*, size_t>*>(columns[actual_col]);
         str_list[rec_id].first  = nullptr;
         str_list[rec_id].second = 0;
       }
@@ -680,7 +680,7 @@ constexpr __device__ uint32_t make_char_context(uint32_t id0,
 * The char_ctx value should be created via make_char_context, and its value should
 * have been evaluated at compile-time.
 */
-inline __device__ void merge_char_context(uint4 &ctx, uint32_t char_ctx, uint32_t pos)
+inline __device__ void merge_char_context(uint4& ctx, uint32_t char_ctx, uint32_t pos)
 {
   uint32_t id0 = (ctx.w >> 0) & 3;
   uint32_t id1 = (ctx.w >> 2) & 3;
@@ -709,9 +709,10 @@ inline __device__ packed_rowctx_t pack_rowmaps(uint4 ctx_map)
 */
 inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid)
 {
-  return (ctxid == ROW_CTX_NONE)
-           ? ctx_map.x
-           : (ctxid == ROW_CTX_QUOTE) ? ctx_map.y : (ctxid == ROW_CTX_COMMENT) ? ctx_map.z : 0;
+  return (ctxid == ROW_CTX_NONE)      ? ctx_map.x
+         : (ctxid == ROW_CTX_QUOTE)   ? ctx_map.y
+         : (ctxid == ROW_CTX_COMMENT) ? ctx_map.z
+                                      : 0;
 }
@@ -731,7 +732,7 @@ inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid)
 * @param t thread id (leaf node id)
 */
 template <uint32_t lanemask, uint32_t tmask, uint32_t base, uint32_t level_scale>
-inline __device__ void ctx_merge(uint64_t *ctxtree, packed_rowctx_t *ctxb, uint32_t t)
+inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint32_t t)
 {
   uint64_t tmp = shuffle_xor(*ctxb, lanemask);
   if (!(t & tmask)) {
@@ -754,7 +755,7 @@ inline __device__ void ctx_merge(uint64_t *ctxtree, packed_rowctx_t *ctxb, uint3
 */
 template <uint32_t rmask>
 inline __device__ void ctx_unmerge(
-  uint32_t base, uint64_t *ctxtree, uint32_t *ctx, uint32_t *brow4, uint32_t t)
+  uint32_t base, uint64_t* ctxtree, uint32_t* ctx, uint32_t* brow4, uint32_t t)
 {
   rowctx32_t ctxb_left, ctxb_right, ctxb_sum;
   ctxb_sum = get_row_context(ctxtree[base], *ctx);
@@ -869,7 +870,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt
 * @param commentchar Comment line character (skip rows starting with this character)
 */
 __global__ void __launch_bounds__(rowofs_block_dim)
-  gather_row_offsets_gpu(uint64_t *row_ctx,
+  gather_row_offsets_gpu(uint64_t* row_ctx,
                          device_span<uint64_t> offsets_out,
                          device_span<char const> const data,
                          size_t chunk_size,
@@ -892,11 +893,11 @@ __global__ void __launch_bounds__(rowofs_block_dim)
     __align__(8) uint64_t ctxtree[rowofs_block_dim * 2];
   } temp_storage;

-  const char *end = start + (min(parse_pos + chunk_size, data_size) - start_offset);
+  const char* end = start + (min(parse_pos + chunk_size, data_size) - start_offset);
   uint32_t t      = threadIdx.x;
   size_t block_pos =
     (parse_pos - start_offset) + blockIdx.x * static_cast<size_t>(rowofs_block_bytes) + t * 32;
-  const char *cur = start + block_pos;
+  const char* cur = start + block_pos;

   // Initial state is neutral context (no state transitions), zero rows
   uint4 ctx_map = {
@@ -934,7 +935,7 @@ __global__ void __launch_bounds__(rowofs_block_dim)
       ctx = make_char_context(ROW_CTX_NONE, ROW_CTX_QUOTE);
     }
   } else {
-    const char *data_end = start + data_size - start_offset;
+    const char* data_end = start + data_size - start_offset;
     if (cur <= end && cur == data_end) {
       // Add a newline at data end (need the extra row offset to infer length of previous row)
       ctx = make_char_context(ROW_CTX_EOF, ROW_CTX_EOF, ROW_CTX_EOF, 1, 1, 1);
@@ -993,7 +994,7 @@ __global__ void __launch_bounds__(rowofs_block_dim)
   }
 }

-size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts,
+size_t __host__ count_blank_rows(const cudf::io::parse_options_view& opts,
                                  device_span<char const> data,
                                  device_span<uint64_t const> row_offsets,
                                  rmm::cuda_stream_view stream)
@@ -1011,7 +1012,7 @@ size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts,
     });
 }

-device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view const &options,
+device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view const& options,
                                                  device_span<char const> data,
                                                  device_span<uint64_t> row_offsets,
                                                  rmm::cuda_stream_view stream)
@@ -1032,7 +1033,7 @@ device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view co
 }

 std::vector<column_type_histogram> detect_column_types(
-  cudf::io::parse_options_view const &options,
+  cudf::io::parse_options_view const& options,
   device_span<char const> const data,
   device_span<column_parse::flags const> const column_flags,
   device_span<uint64_t const> const row_starts,
@@ -1052,13 +1053,13 @@ std::vector<column_type_histogram> detect_column_types(
   return detail::make_std_vector_sync(d_stats, stream);
 }

-void __host__ decode_row_column_data(cudf::io::parse_options_view const &options,
+void __host__ decode_row_column_data(cudf::io::parse_options_view const& options,
                                      device_span<char const> data,
                                      device_span<column_parse::flags const> column_flags,
                                      device_span<uint64_t const> row_offsets,
                                      device_span<data_type const> dtypes,
-                                     device_span<void *const> columns,
-                                     device_span<cudf::bitmask_type *const> valids,
+                                     device_span<void* const> columns,
+                                     device_span<cudf::bitmask_type* const> valids,
                                      rmm::cuda_stream_view stream)
 {
   // Calculate actual block count to use based on records count
@@ -1070,8 +1071,8 @@ void __host__ decode_row_column_data(cudf::io::parse_options_view const &options
     options, data, column_flags, row_offsets, dtypes, columns, valids);
 }

-uint32_t __host__ gather_row_offsets(const parse_options_view &options,
-                                     uint64_t *row_ctx,
+uint32_t __host__ gather_row_offsets(const parse_options_view& options,
+                                     uint64_t* row_ctx,
                                      device_span<uint64_t> const offsets_out,
                                      device_span<char const> const data,
                                      size_t chunk_size,
diff --git a/cpp/src/io/csv/csv_gpu.h b/cpp/src/io/csv/csv_gpu.h
index 838abe66b94..9b83028fa92 100644
--- a/cpp/src/io/csv/csv_gpu.h
+++ b/cpp/src/io/csv/csv_gpu.h
@@ -149,8 +149,8 @@ inline __host__ __device__ rowctx64_t select_row_context(rowctx64_t sel_ctx,
 *
 * @return Number of row contexts
 */
-uint32_t gather_row_offsets(cudf::io::parse_options_view const &options,
-                            uint64_t *row_ctx,
+uint32_t gather_row_offsets(cudf::io::parse_options_view const& options,
+                            uint64_t* row_ctx,
                             device_span<uint64_t> offsets_out,
                             device_span<char const> data,
                             size_t chunk_size,
@@ -170,7 +170,7 @@ uint32_t gather_row_offsets(cudf::io::parse_options_view const &options,
 * @param row_offsets Row offsets in the character data buffer
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
-size_t count_blank_rows(cudf::io::parse_options_view const &options,
+size_t count_blank_rows(cudf::io::parse_options_view const& options,
                         device_span<char const> data,
                         device_span<uint64_t const> row_offsets,
                         rmm::cuda_stream_view stream);
@@ -183,7 +183,7 @@ size_t count_blank_rows(cudf::io::parse_options_view const &options,
 * @param row_offsets Row offsets in the character data buffer
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
-device_span<uint64_t> remove_blank_rows(const cudf::io::parse_options_view &options,
+device_span<uint64_t> remove_blank_rows(const cudf::io::parse_options_view& options,
                                         device_span<char const> data,
                                         device_span<uint64_t> row_offsets,
                                         rmm::cuda_stream_view stream);
@@ -200,7 +200,7 @@ device_span<uint64_t> remove_blank_rows(const cudf::io::parse_options_view &opti
 * @return stats Histogram of each dtypes' occurrence for each column
 */
 std::vector<column_type_histogram> detect_column_types(
-  cudf::io::parse_options_view const &options,
+  cudf::io::parse_options_view const& options,
   device_span<char const> data,
   device_span<column_parse::flags const> column_flags,
   device_span<uint64_t const> row_offsets,
@@ -219,13 +219,13 @@ std::vector<column_type_histogram> detect_column_types(
 * @param[out] valids Device memory output of column valids bitmap data
 * @param[in] stream CUDA stream to use, default 0
 */
-void decode_row_column_data(cudf::io::parse_options_view const &options,
+void decode_row_column_data(cudf::io::parse_options_view const& options,
                             device_span<char const> data,
                             device_span<column_parse::flags const> column_flags,
                             device_span<uint64_t const> row_offsets,
                             device_span<data_type const> dtypes,
-                            device_span<void *const> columns,
-                            device_span<cudf::bitmask_type *const> valids,
+                            device_span<void* const> columns,
+                            device_span<cudf::bitmask_type* const> valids,
                             rmm::cuda_stream_view stream);

 }  // namespace gpu
diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh
index 4e4ddd09a9f..7160041ff4e 100644
--- a/cpp/src/io/csv/datetime.cuh
+++ b/cpp/src/io/csv/datetime.cuh
@@ -232,7 +232,9 @@ __inline__ __device__ void extract_time(
   if (*last == 'M' || *last == 'm') {
     if (*(last - 1) == 'P' || *(last - 1) == 'p') { hour_adjust = 12; }
     last = last - 2;
-    while (*last == ' ') { --last; }
+    while (*last == ' ') {
+      --last;
+    }
   }
   end = last + 1;
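Editor's note (not part of the diff): decode_op above selects one operator() per column type with enable_if-gated template overloads. A self-contained sketch of the same dispatch pattern, with our own functor and simple parsing stand-ins instead of cudf's:

```cpp
#include <cmath>
#include <cstdlib>
#include <type_traits>

struct parse_dispatch {
  // Chosen for integral T (the real decode_op also excludes bool here).
  template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
  bool operator()(T& out, const char* s) const
  {
    out = static_cast<T>(std::strtoll(s, nullptr, 10));
    return true;  // integral parse always marks the row valid
  }

  // Chosen for floating-point T; NaN marks the row invalid, as in the diff.
  template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
  bool operator()(T& out, const char* s) const
  {
    out = static_cast<T>(std::strtod(s, nullptr));
    return !std::isnan(out);
  }
};
```

Because the `enable_if_t` conditions are mutually exclusive, exactly one overload participates in overload resolution for each `T`, which is what lets the kernel call a single functor for every column type.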
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 71391c8c444..70ce0fce1cc 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include <unordered_set>

 using std::string;
 using std::vector;
@@ -57,7 +58,7 @@ struct VisitorOverload : Ts... {
   using Ts::operator()...;
 };
 template <class... Ts>
-VisitorOverload(Ts...)->VisitorOverload<Ts...>;
+VisitorOverload(Ts...) -> VisitorOverload<Ts...>;
 }  // namespace

 namespace cudf {
@@ -102,7 +103,7 @@ constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept
 *
 * @return Tuple of data_type and flags
 */
-std::tuple<data_type, column_parse::flags> get_dtype_info(const std::string &dtype)
+std::tuple<data_type, column_parse::flags> get_dtype_info(const std::string& dtype)
 {
   if (dtype == "hex" || dtype == "hex64") {
     return std::make_tuple(data_type{cudf::type_id::INT64}, column_parse::as_hexadecimal);
@@ -132,8 +133,8 @@ string removeQuotes(string str, char quotechar)
 * @brief Parse the first row to set the column names in the raw_csv parameter.
 * The first row can be either the header row, or the first data row
 */
-std::vector<std::string> setColumnNames(std::vector<char> const &header,
-                                        parse_options_view const &opts,
+std::vector<std::string> setColumnNames(std::vector<char> const& header,
+                                        parse_options_view const& opts,
                                         int header_row,
                                         std::string prefix)
 {
@@ -196,7 +197,7 @@ std::vector<std::string> setColumnNames(std::vector<char> const &header,
 }

 template <typename C>
-void erase_except_last(C &container, rmm::cuda_stream_view stream)
+void erase_except_last(C& container, rmm::cuda_stream_view stream)
 {
   cudf::detail::device_single_thread(
     [span = device_span<typename C::value_type>{container}] __device__() mutable {
@@ -222,7 +223,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
   size_t map_range_size = 0;
   if (range_size != 0) {
     auto num_given_dtypes =
-      std::visit([](const auto &dtypes) { return dtypes.size(); }, opts_.get_dtypes());
+      std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes());
     const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes);
     map_range_size         = range_size + calculateMaxRowSize(num_columns);
   }
@@ -240,7 +241,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
     auto buffer = source_->host_read(range_offset, data_size);
     auto h_data = host_span<char const>(  //
-      reinterpret_cast<const char *>(buffer->data()),
+      reinterpret_cast<const char*>(buffer->data()),
       buffer->size());

     std::vector<char> h_uncomp_data_owner;
@@ -269,7 +270,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
                                                num_rows,
                                                load_whole_file,
                                                stream);
-    auto &row_offsets = data_row_offsets.second;
+    auto& row_offsets = data_row_offsets.second;
     // Exclude the rows that are to be skipped from the end
     if (skip_end_rows > 0 && static_cast<size_t>(skip_end_rows) < row_offsets.size()) {
       row_offsets.shrink(row_offsets.size() - skip_end_rows);
@@ -282,8 +283,8 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
 {
   auto const data_row_offsets = select_data_and_row_offsets(stream);
-  auto const &data        = data_row_offsets.first;
-  auto const &row_offsets = data_row_offsets.second;
+  auto const& data        = data_row_offsets.first;
+  auto const& row_offsets = data_row_offsets.second;
   // Exclude the end-of-data row from number of rows with actual data
   num_records_ = std::max(row_offsets.size(), 1ul) - 1;
@@ -308,14 +309,16 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   // Looking for duplicates
   std::unordered_map<string, int> col_names_histogram;
-  for (auto &col_name : col_names_) {
+  for (auto& col_name : col_names_) {
     // Operator [] inserts a default-initialized value if the given key is not
     // present
     if (++col_names_histogram[col_name] > 1) {
       if (opts_.is_enabled_mangle_dupe_cols()) {
         // Rename duplicates of column X as X.1, X.2, ...; First appearance
         // stays as X
-        col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
+        do {
+          col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
+        } while (col_names_histogram[col_name]++);
       } else {
         // All duplicate columns will be ignored; First appearance is parsed
         const auto idx = &col_name - col_names_.data();
@@ -336,13 +339,18 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     for (const auto index : opts_.get_use_cols_indexes()) {
       column_flags_[index] = column_parse::enabled;
     }
-    num_active_cols_ = opts_.get_use_cols_indexes().size();
+    num_active_cols_ = std::unordered_set<int>(opts_.get_use_cols_indexes().begin(),
+                                               opts_.get_use_cols_indexes().end())
+                         .size();

-    for (const auto &name : opts_.get_use_cols_names()) {
+    for (const auto& name : opts_.get_use_cols_names()) {
       const auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
-        column_flags_[it - col_names_.begin()] = column_parse::enabled;
-        num_active_cols_++;
+        auto curr_it = it - col_names_.begin();
+        if (column_flags_[curr_it] == column_parse::disabled) {
+          column_flags_[curr_it] = column_parse::enabled;
+          num_active_cols_++;
+        }
       }
     }
   }
@@ -353,7 +361,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     column_flags_[index] |= column_parse::as_datetime;
   }

-  for (const auto &name : opts_.get_infer_date_names()) {
+  for (const auto& name : opts_.get_infer_date_names()) {
     auto it = std::find(col_names_.begin(), col_names_.end(), name);
     if (it != col_names_.end()) {
       column_flags_[it - col_names_.begin()] |= column_parse::as_datetime;
@@ -368,7 +376,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   auto out_columns = std::vector<std::unique_ptr<cudf::column>>();

   bool has_to_infer_column_types =
-    std::visit([](const auto &dtypes) { return dtypes.empty(); }, opts_.get_dtypes());
+    std::visit([](const auto& dtypes) { return dtypes.empty(); }, opts_.get_dtypes());

   std::vector<data_type> column_types;
   if (has_to_infer_column_types) {
@@ -376,8 +384,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   } else {
     column_types =
       std::visit(VisitorOverload{
-                   [&](const std::vector<data_type> &data_types) { return data_types; },
-                   [&](const std::vector<std::string> &dtypes) { return parse_column_types(dtypes); }},
+                   [&](const std::vector<data_type>& data_types) { return data_types; },
+                   [&](const std::vector<std::string>& dtypes) { return parse_column_types(dtypes); }},
                  opts_.get_dtypes());
   }
@@ -422,7 +430,9 @@ size_t reader::impl::find_first_row_start(host_span<char const> data)
   // For now, look for the first terminator (assume the first terminator isn't within a quote)
   // TODO: Attempt to infer this from the data
   size_t pos = 0;
-  while (pos < data.size() && data[pos] != opts.terminator) { ++pos; }
+  while (pos < data.size() && data[pos] != opts.terminator) {
+    ++pos;
+  }
   return std::min(pos + 1, data.size());
 }
@@ -529,7 +539,9 @@ reader::impl::load_data_and_gather_row_offsets(host_span<char const> data,
   stream.synchronize();

   size_t rows_out_of_range = 0;
-  for (uint32_t i = 0; i < num_blocks; i++) { rows_out_of_range += row_ctx[i]; }
+  for (uint32_t i = 0; i < num_blocks; i++) {
+    rows_out_of_range += row_ctx[i];
+  }
   if (rows_out_of_range != 0) {
     // Keep one row out of range (used to infer length of previous row)
     auto new_row_offsets_size =
@@ -641,7 +653,7 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
   }

   if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
-    for (auto &type : dtypes) {
+    for (auto& type : dtypes) {
       if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
     }
   }
@@ -655,13 +667,13 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
 }

 std::vector<data_type> reader::impl::parse_column_types(
-  const std::vector<std::string> &types_as_strings)
+  const std::vector<std::string>& types_as_strings)
 {
   std::vector<data_type> dtypes;

   const bool is_dict = std::all_of(types_as_strings.begin(),
                                    types_as_strings.end(),
-                                   [](const auto &s) { return s.find(':') != std::string::npos; });
+                                   [](const auto& s) { return s.find(':') != std::string::npos; });

   if (!is_dict) {
     if (types_as_strings.size() == 1) {
@@ -670,7 +682,9 @@ std::vector<data_type> reader::impl::parse_column_types(
       column_parse::flags col_flags_;
       std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
       dtypes.resize(num_active_cols_, dtype_);
-      for (int col = 0; col < num_actual_cols_; col++) { column_flags_[col] |= col_flags_; }
+      for (int col = 0; col < num_actual_cols_; col++) {
+        column_flags_[col] |= col_flags_;
+      }
       CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
     } else {
       // If it's a list, assign dtypes to active columns in the given order
@@ -692,7 +706,7 @@ std::vector<data_type> reader::impl::parse_column_types(
     // Translate vector of `name : dtype` strings to map
     // NOTE: Incoming pairs can be out-of-order from column names in dataset
     std::unordered_map<std::string, data_type> col_type_map;
-    for (const auto &pair : types_as_strings) {
+    for (const auto& pair : types_as_strings) {
       const auto pos   = pair.find_last_of(':');
       const auto name  = pair.substr(0, pos);
       const auto dtype = pair.substr(pos + 1, pair.size());
@@ -714,7 +728,7 @@ std::vector<data_type> reader::impl::parse_column_types(
   }

   if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
-    for (auto &type : dtypes) {
+    for (auto& type : dtypes) {
       if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
     }
   }
@@ -753,8 +767,8 @@ std::vector<column_buffer> reader::impl::decode_data(device_span<char const> dat
     }
   }

-  thrust::host_vector<void *> h_data(num_active_cols_);
-  thrust::host_vector<bitmask_type *> h_valid(num_active_cols_);
+  thrust::host_vector<void*> h_data(num_active_cols_);
+  thrust::host_vector<bitmask_type*> h_valid(num_active_cols_);

   for (int i = 0; i < num_active_cols_; ++i) {
     h_data[i]  = out_buffers[i].data();
@@ -777,7 +791,7 @@ std::vector<column_buffer> reader::impl::decode_data(device_span<char const> dat
 * @brief Create a serialized trie for N/A value matching, based on the options.
 */
 cudf::detail::trie create_na_trie(char quotechar,
-                                  csv_reader_options const &reader_opts,
+                                  csv_reader_options const& reader_opts,
                                   rmm::cuda_stream_view stream)
 {
   // Default values to recognize as null values
@@ -815,7 +829,7 @@ cudf::detail::trie create_na_trie(char quotechar,
   return cudf::detail::create_serialized_trie(na_values, stream);
 }

-parse_options make_parse_options(csv_reader_options const &reader_opts,
+parse_options make_parse_options(csv_reader_options const& reader_opts,
                                  rmm::cuda_stream_view stream)
 {
   auto parse_opts = parse_options{};
@@ -873,9 +887,9 @@ parse_options make_parse_options(csv_reader_options const &reader_opts,

 reader::impl::impl(std::unique_ptr<datasource> source,
                    std::string filepath,
-                   csv_reader_options const &options,
+                   csv_reader_options const& options,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource *mr)
+                   rmm::mr::device_memory_resource* mr)
   : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options)
 {
   num_actual_cols_ = opts_.get_names().size();
@@ -890,10 +904,10 @@ reader::impl::impl(std::unique_ptr<datasource> source,
 }

 // Forward to implementation
-reader::reader(std::vector<std::string> const &filepaths,
-               csv_reader_options const &options,
+reader::reader(std::vector<std::string> const& filepaths,
+               csv_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported.");
   // Delay actual instantiation of data source until read to allow for
@@ -902,10 +916,10 @@ reader::reader(std::vector<std::string> const &filepaths,
 }

 // Forward to implementation
-reader::reader(std::vector<std::unique_ptr<datasource>> &&sources,
-               csv_reader_options const &options,
+reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
+               csv_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported.");
   _impl = std::make_unique<impl>(std::move(sources[0]), "", options, stream, mr);
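Editor's note (not part of the diff): the mangle_dupe_cols change above renames later occurrences of a duplicated header X to X.1, X.2, ... and now re-probes the histogram so that a mangled name which is itself already taken gets mangled again. The same rule as a standalone sketch, with names of our choosing:

```cpp
#include <string>
#include <unordered_map>
#include <vector>

void mangle_duplicate_names(std::vector<std::string>& names)
{
  std::unordered_map<std::string, int> histogram;
  for (auto& name : names) {
    if (++histogram[name] > 1) {
      // Rename later occurrences of X to X.1, X.2, ...; keep looping in case
      // the mangled name collides with a column that already exists.
      do {
        name += "." + std::to_string(histogram[name] - 1);
      } while (histogram[name]++);
    }
  }
}
```

For example, {"a", "a", "a.1"} becomes {"a", "a.1", "a.1.1"}: the second "a" is renamed to "a.1", and the pre-existing "a.1" is then pushed to "a.1.1" by the re-probe.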
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 17f27a28e30..29c6b48bc8a 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -79,9 +79,9 @@ class reader::impl {
   */
   explicit impl(std::unique_ptr<datasource> source,
                 std::string filepath,
-                csv_reader_options const &options,
+                csv_reader_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource *mr);
+                rmm::mr::device_memory_resource* mr);

   /**
   * @brief Read an entire set or a subset of data and returns a set of columns.
@@ -104,7 +104,7 @@ class reader::impl {
     device_span<uint64_t const> selected;

    public:
-    selected_rows_offsets(rmm::device_uvector<uint64_t> &&data,
+    selected_rows_offsets(rmm::device_uvector<uint64_t>&& data,
                           device_span<uint64_t const> selected_span)
       : all{std::move(data)}, selected{selected_span}
     {
@@ -188,7 +188,7 @@ class reader::impl {
   * types
   * @return List of columns' data types
   */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const &types_as_strings);
+  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);

   /**
   * @brief Converts the row-column data and outputs to column bufferrs.
@@ -204,7 +204,7 @@ class reader::impl {
                                          rmm::cuda_stream_view stream);

  private:
-  rmm::mr::device_memory_resource *mr_ = nullptr;
+  rmm::mr::device_memory_resource* mr_ = nullptr;
   std::unique_ptr<datasource> source_;
   std::string filepath_;
   std::string compression_type_;
cudf::timestamp_ms{cudf::duration_ms{milli}}; @@ -196,9 +196,9 @@ __inline__ __device__ cudf::timestamp_ms decode_value(const char *begin, * @return The parsed timestamp_us */ template <> -__inline__ __device__ cudf::timestamp_us decode_value(const char *begin, - const char *end, - parse_options_view const &opts) +__inline__ __device__ cudf::timestamp_us decode_value(const char* begin, + const char* end, + parse_options_view const& opts) { auto milli = to_date_time(begin, end, opts.dayfirst); return cudf::timestamp_us{cudf::duration_us{milli * 1000}}; @@ -214,21 +214,21 @@ __inline__ __device__ cudf::timestamp_us decode_value(const char *begin, * @return The parsed timestamp_ns */ template <> -__inline__ __device__ cudf::timestamp_ns decode_value(const char *begin, - const char *end, - parse_options_view const &opts) +__inline__ __device__ cudf::timestamp_ns decode_value(const char* begin, + const char* end, + parse_options_view const& opts) { auto milli = to_date_time(begin, end, opts.dayfirst); return cudf::timestamp_ns{cudf::duration_ns{milli * 1000000}}; } #ifndef DURATION_DECODE_VALUE -#define DURATION_DECODE_VALUE(Type) \ - template <> \ - __inline__ __device__ Type decode_value( \ - const char *begin, const char *end, parse_options_view const &) \ - { \ - return Type{to_time_delta(begin, end)}; \ +#define DURATION_DECODE_VALUE(Type) \ + template <> \ + __inline__ __device__ Type decode_value( \ + const char* begin, const char* end, parse_options_view const&) \ + { \ + return Type{to_time_delta(begin, end)}; \ } #endif DURATION_DECODE_VALUE(duration_D) @@ -239,48 +239,48 @@ DURATION_DECODE_VALUE(duration_ns) // The purpose of these is merely to allow compilation ONLY template <> -__inline__ __device__ cudf::string_view decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::string_view decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::string_view{}; } template <> -__inline__ __device__ cudf::dictionary32 decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::dictionary32 decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::dictionary32{}; } template <> -__inline__ __device__ cudf::list_view decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::list_view decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::list_view{}; } template <> -__inline__ __device__ cudf::struct_view decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::struct_view decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::struct_view{}; } template <> -__inline__ __device__ numeric::decimal32 decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ numeric::decimal32 decode_value(const char*, + const char*, + parse_options_view const&) { return numeric::decimal32{}; } template <> -__inline__ __device__ numeric::decimal64 decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ numeric::decimal64 decode_value(const char*, + const char*, + parse_options_view const&) { return numeric::decimal64{}; } @@ -297,14 +297,14 @@ struct ConvertFunctor { * It is handled here rather than within convertStrToValue() as that function * is used by other types (ex. timestamp) that aren't 'booleable'. 
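The `ConvertFunctor` overloads above select an implementation per type category via `enable_if`. Below is a small host-only sketch of that dispatch, including the floating-point rule where a NaN result clears validity; names are illustrative, not libcudf's:

```cpp
#include <cmath>
#include <iostream>
#include <type_traits>

// Overload chosen for floating-point types: NaN clears the validity bit
template <typename T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
bool store_field(T value, T* out)
{
  *out = value;
  return !std::isnan(value);
}

// Overload chosen for everything else: always valid once parsed
template <typename T, std::enable_if_t<!std::is_floating_point<T>::value>* = nullptr>
bool store_field(T value, T* out)
{
  *out = value;
  return true;
}

int main()
{
  double d{};
  int i{};
  std::cout << store_field(std::nan(""), &d) << ' ' << store_field(42, &i) << '\n';  // 0 1
}
```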
*/ - template ::value> * = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const *begin, - char const *end, - void *output_column, + template ::value>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, cudf::size_type row, - const parse_options_view &opts) + const parse_options_view& opts) { - T &value{static_cast(output_column)[row]}; + T& value{static_cast(output_column)[row]}; value = [&opts, end, begin]() -> T { // Check for user-specified true/false values @@ -321,15 +321,15 @@ struct ConvertFunctor { * @brief Dispatch for floating points, which are set to NaN if the input * is not valid. In such case, the validity mask is set to zero too. */ - template ::value> * = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const *begin, - char const *end, - void *out_buffer, + template ::value>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, size_t row, - parse_options_view const &opts) + parse_options_view const& opts) { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; + T const value = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = value; return !std::isnan(value); } @@ -340,14 +340,14 @@ struct ConvertFunctor { */ template ::value and - !std::is_integral::value> * = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const *begin, - char const *end, - void *output_column, + !std::is_integral::value>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, cudf::size_type row, - const parse_options_view &opts) + const parse_options_view& opts) { - static_cast(output_column)[row] = decode_value(begin, end, opts); + static_cast(output_column)[row] = decode_value(begin, end, opts); return true; } @@ -405,8 +405,8 @@ __device__ __inline__ bool is_like_float( */ struct field_descriptor { cudf::size_type column; - char const *value_begin; - char const *value_end; + char const* value_begin; + char const* value_end; }; /** @@ -416,15 +416,15 @@ struct field_descriptor { * @param[in] end pointer to the first character after the parsing range * @param[in] opts The global parsing behavior options * @param[in] field_idx Index of the current field in the input row - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @return Descriptor of the parsed field */ -__device__ field_descriptor next_field_descriptor(const char *begin, - const char *end, - parse_options_view const &opts, +__device__ field_descriptor next_field_descriptor(const char* begin, + const char* end, + parse_options_view const& opts, cudf::size_type field_idx, - col_map_type *col_map) + col_map_type* col_map) { auto const desc_pre_trim = col_map == nullptr @@ -463,7 +463,7 @@ __device__ field_descriptor next_field_descriptor(const char *begin, * * @return The begin and end iterators of the row data. 
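`get_row_data_range` above reduces to offset arithmetic over the flat input buffer. A simplified host sketch follows, assuming the next row's offset serves as the current row's end (the device code additionally trims the range to the enclosing brackets):

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
  std::string const data = R"({"a":1}{"a":2}{"a":3})";
  // One offset per row, plus a past-the-end sentinel
  std::vector<size_t> const row_offsets{0, 7, 14, data.size()};

  for (size_t row = 0; row + 1 < row_offsets.size(); ++row) {
    auto const begin = data.begin() + row_offsets[row];
    auto const end   = data.begin() + row_offsets[row + 1];
    std::cout << "row " << row << ": " << std::string(begin, end) << '\n';
  }
}
```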
*/ -__device__ std::pair get_row_data_range( +__device__ std::pair get_row_data_range( device_span const data, device_span const row_offsets, size_type row) { auto const row_begin = data.begin() + row_offsets[row]; @@ -481,7 +481,7 @@ __device__ std::pair get_row_data_range( * @param[in] data The entire data to read * @param[in] row_offsets The offset of each row in the input * @param[in] column_types The data type of each column - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @param[out] output_columns The output column data * @param[out] valid_fields The bitmaps indicating whether column fields are valid @@ -491,9 +491,9 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, device_span const data, device_span const row_offsets, device_span const column_types, - col_map_type *col_map, - device_span const output_columns, - device_span const valid_fields, + col_map_type* col_map, + device_span const output_columns, + device_span const valid_fields, device_span const num_valid_fields) { const auto rec_id = threadIdx.x + (blockDim.x * blockIdx.x); @@ -515,7 +515,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); + auto str_list = static_cast(output_columns[desc.column]); str_list[rec_id].first = desc.value_begin; str_list[rec_id].second = value_len; @@ -536,7 +536,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, } } } else if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); + auto str_list = static_cast(output_columns[desc.column]); str_list[rec_id].first = nullptr; str_list[rec_id].second = 0; } @@ -562,7 +562,7 @@ __global__ void detect_data_types_kernel( parse_options_view const opts, device_span const data, device_span const row_offsets, - col_map_type *col_map, + col_map_type* col_map, int num_columns, device_span const column_infos) { @@ -645,8 +645,8 @@ __global__ void detect_data_types_kernel( atomicAdd(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); - char const *data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); - cudf::size_type *ptr = cudf::io::gpu::infer_integral_field_counter( + char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); + cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); atomicAdd(ptr, 1); } else if (is_like_float( @@ -685,18 +685,18 @@ __global__ void detect_data_types_kernel( * @brief Input data range that contains a field in key:value format. */ struct key_value_range { - char const *key_begin; - char const *key_end; - char const *value_begin; - char const *value_end; + char const* key_begin; + char const* key_end; + char const* value_begin; + char const* value_end; }; /** * @brief Parse the next field in key:value format and return ranges of its parts. 
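`get_next_key_value_range` is consumed in a loop that repeatedly advances over key:value fields until the row is exhausted (see `collect_keys_info_kernel` below). A host sketch of that advance-lambda pattern, with parsing simplified to colon/comma splitting (the real parser is quote-aware):

```cpp
#include <cstddef>
#include <iostream>
#include <string>

// Byte positions of one key:value field, mirroring key_value_range above
struct field_range {
  size_t key_begin, key_end, value_begin, value_end;
};

int main()
{
  std::string const row = "a:1,bb:22,ccc:333";

  // Simplified advance(): find the next key and its value within the row
  auto advance = [&](size_t begin) -> field_range {
    auto const colon = row.find(':', begin);
    auto comma       = row.find(',', colon);
    if (comma == std::string::npos) comma = row.size();
    return {begin, colon, colon + 1, comma};
  };

  for (size_t pos = 0; pos < row.size();) {
    auto const f = advance(pos);
    std::cout << row.substr(f.key_begin, f.key_end - f.key_begin) << " -> "
              << row.substr(f.value_begin, f.value_end - f.value_begin) << '\n';
    pos = f.value_end + 1;
  }
}
```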
*/ -__device__ key_value_range get_next_key_value_range(char const *begin, - char const *end, - parse_options_view const &opts) +__device__ key_value_range get_next_key_value_range(char const* begin, + char const* end, + parse_options_view const& opts) { auto const key_range = get_next_key(begin, end, opts.quotechar); @@ -721,7 +721,7 @@ __device__ key_value_range get_next_key_value_range(char const *begin, __global__ void collect_keys_info_kernel(parse_options_view const options, device_span const data, device_span const row_offsets, - unsigned long long int *keys_cnt, + unsigned long long int* keys_cnt, thrust::optional keys_info) { auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); @@ -729,7 +729,7 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - auto advance = [&](const char *begin) { + auto advance = [&](const char* begin) { return get_next_key_value_range(begin, row_data_range.second, options); }; for (auto field_range = advance(row_data_range.first); @@ -751,13 +751,13 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, /** * @copydoc cudf::io::json::gpu::convert_json_to_columns */ -void convert_json_to_columns(parse_options_view const &opts, +void convert_json_to_columns(parse_options_view const& opts, device_span const data, device_span const row_offsets, device_span const column_types, - col_map_type *col_map, - device_span const output_columns, - device_span const valid_fields, + col_map_type* col_map, + device_span const output_columns, + device_span const valid_fields, device_span num_valid_fields, rmm::cuda_stream_view stream) { @@ -779,12 +779,12 @@ void convert_json_to_columns(parse_options_view const &opts, */ std::vector detect_data_types( - const parse_options_view &options, + const parse_options_view& options, device_span const data, device_span const row_offsets, bool do_set_null_count, int num_columns, - col_map_type *col_map, + col_map_type* col_map, rmm::cuda_stream_view stream) { int block_size; @@ -822,10 +822,10 @@ std::vector detect_data_types( /** * @copydoc cudf::io::json::gpu::gpu_collect_keys_info */ -void collect_keys_info(parse_options_view const &options, +void collect_keys_info(parse_options_view const& options, device_span const data, device_span const row_offsets, - unsigned long long int *keys_cnt, + unsigned long long int* keys_cnt, thrust::optional keys_info, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index 4a68ce48f20..7a6bce5e5a5 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -44,20 +44,20 @@ using col_map_type = concurrent_unordered_map; * @param[in] data The entire data to read * @param[in] row_offsets The start of each data record * @param[in] dtypes The data type of each column - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @param[out] output_columns The output column data * @param[out] valid_fields The bitmaps indicating whether column fields are valid * @param[out] num_valid_fields The numbers of valid fields in columns * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
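Several of the signatures above take `col_map`, the (column name hash -> column index) map. Here is a host sketch of the idea, with `std::hash` and `std::unordered_map` standing in for the device-resident `concurrent_unordered_map`:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
  std::vector<std::string> const column_names{"id", "name", "score"};

  // Build hash(name) -> column index, as the reader does once up front
  std::unordered_map<size_t, uint32_t> col_map;
  for (uint32_t i = 0; i < column_names.size(); ++i) {
    col_map[std::hash<std::string>{}(column_names[i])] = i;
  }

  // A parsed key is hashed and looked up to find its output column
  auto const it = col_map.find(std::hash<std::string>{}(std::string{"name"}));
  if (it != col_map.end()) { std::cout << "key 'name' -> column " << it->second << '\n'; }
}
```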
*/ -void convert_json_to_columns(parse_options_view const &options, +void convert_json_to_columns(parse_options_view const& options, device_span data, device_span row_offsets, device_span column_types, - col_map_type *col_map, - device_span output_columns, - device_span valid_fields, + col_map_type* col_map, + device_span output_columns, + device_span valid_fields, device_span num_valid_fields, rmm::cuda_stream_view stream); @@ -68,19 +68,19 @@ void convert_json_to_columns(parse_options_view const &options, * @param[in] data Input data buffer * @param[in] row_offsets The offset of each row in the input * @param[in] num_columns The number of columns of input data - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @param[in] stream CUDA stream used for device memory operations and kernel launches. * * @returns The count for each column data type */ std::vector detect_data_types( - parse_options_view const &options, + parse_options_view const& options, device_span data, device_span row_offsets, bool do_set_null_count, int num_columns, - col_map_type *col_map, + col_map_type* col_map, rmm::cuda_stream_view stream); /** @@ -93,10 +93,10 @@ std::vector detect_data_types( * @param[out] keys_info optional, information (offset, length, hash) for each found key * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -void collect_keys_info(parse_options_view const &options, +void collect_keys_info(parse_options_view const& options, device_span data, device_span row_offsets, - unsigned long long int *keys_cnt, + unsigned long long int* keys_cnt, thrust::optional keys_info, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 4d5eee6cac7..b4395d6c965 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -134,9 +134,9 @@ col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, * @param[in] row_offsets Device array of row start locations in the input buffer * @param[in] stream CUDA stream used for device memory operations and kernel launches * - * @return std::unique_ptr
<table> cudf table with three columns (offsets, lenghts, hashes)
+ * @return std::unique_ptr<table> cudf table with three columns (offsets, lengths, hashes)
 */
-std::unique_ptr<table> create_json_keys_info_table(const parse_options_view &options,
+std::unique_ptr<table> create_json_keys_info_table(const parse_options_view& options,
                                                    device_span<char const> const data,
                                                    device_span<uint64_t const> const row_offsets,
                                                    rmm::cuda_stream_view stream)
@@ -167,7 +167,7 @@ std::unique_ptr<table>
create_json_keys_info_table(const parse_options_view &opt /** * @brief Extract the keys from the JSON file the name offsets/lengths. */ -std::vector create_key_strings(char const *h_data, +std::vector create_key_strings(char const* h_data, table_view sorted_info, rmm::cuda_stream_view stream) { @@ -213,7 +213,7 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj { auto info = create_json_keys_info_table( opts_.view(), - device_span(static_cast(data_.data()), data_.size()), + device_span(static_cast(data_.data()), data_.size()), rec_starts, stream); @@ -243,7 +243,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) // This allows only mapping of a subset of the file if using byte range if (sources_.empty()) { assert(!filepaths_.empty()); - for (const auto &path : filepaths_) { + for (const auto& path : filepaths_) { sources_.emplace_back(datasource::create(path, range_offset, map_range_size)); } } @@ -251,12 +251,14 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; - for (const auto &source : sources_) { total_source_size += source->size(); } + for (const auto& source : sources_) { + total_source_size += source->size(); + } total_source_size = total_source_size - range_offset; buffer_.resize(total_source_size); size_t bytes_read = 0; - for (const auto &source : sources_) { + for (const auto& source : sources_) { if (!source->is_empty()) { auto data_size = (map_range_size != 0) ? map_range_size : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); @@ -282,12 +284,12 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); if (compression_type == "none") { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = reinterpret_cast(buffer_.data()); + uncomp_data_ = reinterpret_cast(buffer_.data()); uncomp_size_ = buffer_.size(); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // - reinterpret_cast(buffer_.data()), + reinterpret_cast(buffer_.data()), buffer_.size()), compression_type); @@ -314,7 +316,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ rmm::device_uvector rec_starts(prefilter_count, stream); - auto *find_result_ptr = rec_starts.data(); + auto* find_result_ptr = rec_starts.data(); // Manually adding an extra row to account for the first row in the file if (byte_range_offset_ == 0) { find_result_ptr++; @@ -372,7 +374,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
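`ingest_raw_input` above sizes all sources first, then reads each one into a single buffer at a running offset. A self-contained host sketch of that two-pass concatenation, using `std::ifstream` in place of cudf's `datasource` (an assumption for illustration; error handling omitted):

```cpp
#include <fstream>
#include <iostream>
#include <vector>

int main(int argc, char** argv)
{
  // First pass: open every source and accumulate the total size
  std::vector<std::ifstream> sources;
  size_t total_source_size = 0;
  for (int i = 1; i < argc; ++i) {
    sources.emplace_back(argv[i], std::ios::binary | std::ios::ate);
    total_source_size += static_cast<size_t>(sources.back().tellg());
  }

  std::vector<char> buffer(total_source_size);

  // Second pass: read each source into the buffer at the running offset
  size_t bytes_read = 0;
  for (auto& src : sources) {
    auto const data_size = static_cast<size_t>(src.tellg());
    src.seekg(0);
    src.read(buffer.data() + bytes_read, data_size);
    bytes_read += data_size;
  }
  std::cout << "read " << bytes_read << " bytes from " << sources.size() << " sources\n";
}
```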
*/ -void reader::impl::upload_data_to_device(rmm::device_uvector &rec_starts, +void reader::impl::upload_data_to_device(rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { size_t start_offset = 0; @@ -472,7 +474,7 @@ void reader::impl::set_data_types(device_span rec_starts, // Assume that the dtype is in dictionary format only if all elements contain a colon const bool is_dict = - std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string &s) { + std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string& s) { return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); }); @@ -487,7 +489,7 @@ void reader::impl::set_data_types(device_span rec_starts, std::cbegin(dtype), std::cend(dtype), std::inserter(col_type_map, col_type_map.end()), - [&](auto const &ts) { + [&](auto const& ts) { auto const [col_name, type_str] = split_on_colon(ts); return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; }); @@ -496,12 +498,12 @@ void reader::impl::set_data_types(device_span rec_starts, std::transform(std::cbegin(metadata_.column_names), std::cend(metadata_.column_names), std::back_inserter(dtypes_), - [&](auto const &column_name) { return col_type_map[column_name]; }); + [&](auto const& column_name) { return col_type_map[column_name]; }); } else { std::transform(std::cbegin(dtype), std::cend(dtype), std::back_inserter(dtypes_), - [](auto const &col_dtype) { return convert_string_to_dtype(col_dtype); }); + [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); } } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); @@ -510,14 +512,14 @@ void reader::impl::set_data_types(device_span rec_starts, auto const h_column_infos = cudf::io::json::gpu::detect_data_types( opts_.view(), - device_span(static_cast(data_.data()), data_.size()), + device_span(static_cast(data_.data()), data_.size()), rec_starts, do_set_null_count, num_columns, get_column_map_device_ptr(), stream); - auto get_type_id = [&](auto const &cinfo) { + auto get_type_id = [&](auto const& cinfo) { auto int_count_total = cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; if (cinfo.null_count == static_cast(rec_starts.size())) { @@ -545,7 +547,7 @@ void reader::impl::set_data_types(device_span rec_starts, std::transform(std::cbegin(h_column_infos), std::cend(h_column_infos), std::back_inserter(dtypes_), - [&](auto const &cinfo) { return data_type{get_type_id(cinfo)}; }); + [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; }); } } @@ -562,8 +564,8 @@ table_with_metadata reader::impl::convert_data_to_table(device_span h_dtypes(num_columns); - thrust::host_vector h_data(num_columns); - thrust::host_vector h_valid(num_columns); + thrust::host_vector h_data(num_columns); + thrust::host_vector h_valid(num_columns); for (size_t i = 0; i < num_columns; ++i) { h_dtypes[i] = dtypes_[i]; @@ -572,14 +574,14 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(h_dtypes, stream); - auto d_data = cudf::detail::make_device_uvector_async(h_data, stream); - auto d_valid = cudf::detail::make_device_uvector_async(h_valid, stream); + auto d_data = cudf::detail::make_device_uvector_async(h_data, stream); + auto d_valid = cudf::detail::make_device_uvector_async(h_valid, stream); auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); cudf::io::json::gpu::convert_json_to_columns( opts_.view(), - device_span(static_cast(data_.data()), 
data_.size()), + device_span(static_cast(data_.data()), data_.size()), rec_starts, d_dtypes, get_column_map_device_ptr(), @@ -632,11 +634,11 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader::impl::impl(std::vector> &&sources, - std::vector const &filepaths, - json_reader_options const &options, +reader::impl::impl(std::vector>&& sources, + std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -657,7 +659,7 @@ reader::impl::impl(std::vector> &&sources, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(json_reader_options const &options, +table_with_metadata reader::impl::read(json_reader_options const& options, rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); @@ -686,10 +688,10 @@ table_with_metadata reader::impl::read(json_reader_options const &options, } // Forward to implementation -reader::reader(std::vector const &filepaths, - json_reader_options const &options, +reader::reader(std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { // Delay actual instantiation of data source until read to allow for // partial memory mapping of file using byte ranges @@ -698,10 +700,10 @@ reader::reader(std::vector const &filepaths, } // Forward to implementation -reader::reader(std::vector> &&sources, - json_reader_options const &options, +reader::reader(std::vector>&& sources, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { std::vector file_paths = {}; // Empty filepaths _impl = std::make_unique(std::move(sources), file_paths, options, stream, mr); @@ -711,7 +713,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(json_reader_options const &options, rmm::cuda_stream_view stream) +table_with_metadata reader::read(json_reader_options const& options, rmm::cuda_stream_view stream) { return table_with_metadata{_impl->read(options, stream)}; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index f22653303ce..bbda7e9ba74 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -44,7 +44,7 @@ using namespace cudf::io::json; using namespace cudf::io; using col_map_type = cudf::io::json::gpu::col_map_type; -using col_map_ptr_type = std::unique_ptr>; +using col_map_ptr_type = std::unique_ptr>; /** * @brief Class used to parse Json input and convert it into gdf columns. 
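The forwarding constructors above are the usual pimpl arrangement: the public `reader` owns only a `unique_ptr` to `impl` and delegates every call. A minimal standalone sketch of that pattern, with hypothetical, simplified signatures:

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>

class reader {
 public:
  explicit reader(std::string source);
  ~reader();  // defined out of line, where impl is complete
  std::string read();

 private:
  class impl;                   // declared here, defined below
  std::unique_ptr<impl> _impl;  // all state and logic live behind this pointer
};

class reader::impl {
 public:
  explicit impl(std::string source) : source_{std::move(source)} {}
  std::string read() { return "parsed: " + source_; }

 private:
  std::string source_;
};

// Forward to implementation, as the real reader does
reader::reader(std::string source) : _impl{std::make_unique<impl>(std::move(source))} {}
reader::~reader() = default;
std::string reader::read() { return _impl->read(); }

int main() { std::cout << reader{"rows.jsonl"}.read() << '\n'; }
```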
@@ -54,13 +54,13 @@ class reader::impl { private: const json_reader_options options_{}; - rmm::mr::device_memory_resource *mr_ = nullptr; + rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; std::vector filepaths_; std::vector buffer_; - const char *uncomp_data_ = nullptr; + const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; // Used when the input data is compressed, to ensure the allocated uncompressed data is freed @@ -87,7 +87,7 @@ class reader::impl { * @brief Sets the column map data member and makes a device copy to be used as a kernel * parameter. */ - void set_column_map(col_map_ptr_type &&map, rmm::cuda_stream_view stream) + void set_column_map(col_map_ptr_type&& map, rmm::cuda_stream_view stream) { key_to_col_idx_map_ = std::move(map); d_key_col_map_ = @@ -145,7 +145,7 @@ class reader::impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(rmm::device_uvector &rec_starts, + void upload_data_to_device(rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream); /** @@ -183,11 +183,11 @@ class reader::impl { /** * @brief Constructor from a dataset source with reader options. */ - explicit impl(std::vector> &&sources, - std::vector const &filepaths, - json_reader_options const &options, + explicit impl(std::vector>&& sources, + std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data from the source @@ -197,7 +197,7 @@ class reader::impl { * * @return Table and its metadata */ - table_with_metadata read(json_reader_options const &options, rmm::cuda_stream_view stream); + table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream); }; } // namespace json diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index e69a61bde66..ef39e475b93 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -54,7 +54,7 @@ static inline __device__ uint32_t hash_string(const string_view val) if (val.empty()) { return 0; } else { - char const *ptr = val.data(); + char const* ptr = val.data(); uint32_t len = val.size_bytes(); return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); } @@ -68,13 +68,13 @@ static inline __device__ uint32_t hash_string(const string_view val) * @param[in] temp_storage shared memory storage to scan non-null positions */ template -static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, +static __device__ void LoadNonNullIndices(volatile dictinit_state_s* s, int t, - Storage &temp_storage) + Storage& temp_storage) { if (t == 0) { s->nnz = 0; } for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) { - const uint32_t *valid_map = s->chunk.leaf_column->null_mask(); + const uint32_t* valid_map = s->chunk.leaf_column->null_mask(); auto column_offset = s->chunk.leaf_column->offset(); uint32_t is_valid, nz_pos; if (t < block_size / 32) { @@ -120,12 +120,12 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size, 2) - gpuInitDictionaryIndices(DictionaryChunk *chunks, + gpuInitDictionaryIndices(DictionaryChunk* chunks, const table_device_view view, - uint32_t *dict_data, - uint32_t *dict_index, + uint32_t* dict_data, + uint32_t* dict_index, 
size_t row_index_stride, - size_type *str_col_ids, + size_type* str_col_ids, uint32_t num_columns) { __shared__ __align__(16) dictinit_state_s state_g; @@ -138,14 +138,14 @@ __global__ void __launch_bounds__(block_size, 2) typename block_scan::TempStorage scan_storage; } temp_storage; - dictinit_state_s *const s = &state_g; + dictinit_state_s* const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t group_id = blockIdx.y; uint32_t nnz, start_row, dict_char_count; int t = threadIdx.x; if (t == 0) { - column_device_view *leaf_column_view = view.begin() + str_col_ids[col_id]; + column_device_view* leaf_column_view = view.begin() + str_col_ids[col_id]; s->chunk = chunks[group_id * num_columns + col_id]; s->chunk.leaf_column = leaf_column_view; s->chunk.dict_data = @@ -305,21 +305,21 @@ __global__ void __launch_bounds__(block_size, 2) */ // blockDim {1024,1,1} extern "C" __global__ void __launch_bounds__(1024) - gpuCompactChunkDictionaries(StripeDictionary *stripes, - DictionaryChunk const *chunks, + gpuCompactChunkDictionaries(StripeDictionary* stripes, + DictionaryChunk const* chunks, uint32_t num_columns) { __shared__ __align__(16) StripeDictionary stripe_g; __shared__ __align__(16) DictionaryChunk chunk_g; - __shared__ const uint32_t *volatile ck_curptr_g; + __shared__ const uint32_t* volatile ck_curptr_g; __shared__ uint32_t volatile ck_curlen_g; uint32_t col_id = blockIdx.x; uint32_t stripe_id = blockIdx.y; uint32_t chunk_len; int t = threadIdx.x; - const uint32_t *src; - uint32_t *dst; + const uint32_t* src; + uint32_t* dst; if (t == 0) stripe_g = stripes[stripe_id * num_columns + col_id]; __syncthreads(); @@ -365,7 +365,7 @@ struct build_state_s { // blockDim {1024,1,1} template __global__ void __launch_bounds__(block_size) - gpuBuildStripeDictionaries(StripeDictionary *stripes, uint32_t num_columns) + gpuBuildStripeDictionaries(StripeDictionary* stripes, uint32_t num_columns) { __shared__ __align__(16) build_state_s state_g; using block_reduce = cub::BlockReduce; @@ -375,7 +375,7 @@ __global__ void __launch_bounds__(block_size) typename block_scan::TempStorage scan_storage; } temp_storage; - build_state_s *const s = &state_g; + build_state_s* const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t stripe_id = blockIdx.y; uint32_t num_strings; @@ -427,12 +427,12 @@ __global__ void __launch_bounds__(block_size) /** * @copydoc cudf::io::orc::gpu::InitDictionaryIndices */ -void InitDictionaryIndices(const table_device_view &view, - DictionaryChunk *chunks, - uint32_t *dict_data, - uint32_t *dict_index, +void InitDictionaryIndices(const table_device_view& view, + DictionaryChunk* chunks, + uint32_t* dict_data, + uint32_t* dict_index, size_t row_index_stride, - size_type *str_col_ids, + size_type* str_col_ids, uint32_t num_columns, uint32_t num_rowgroups, rmm::cuda_stream_view stream) @@ -447,9 +447,9 @@ void InitDictionaryIndices(const table_device_view &view, /** * @copydoc cudf::io::orc::gpu::BuildStripeDictionaries */ -void BuildStripeDictionaries(StripeDictionary *stripes, - StripeDictionary *stripes_host, - DictionaryChunk const *chunks, +void BuildStripeDictionaries(StripeDictionary* stripes, + StripeDictionary* stripes_host, + DictionaryChunk const* chunks, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t num_columns, @@ -463,12 +463,12 @@ void BuildStripeDictionaries(StripeDictionary *stripes, if (stripes_host[i].dict_data != nullptr) { thrust::device_ptr dict_data_ptr = thrust::device_pointer_cast(stripes_host[i].dict_data); - column_device_view *string_column = 
stripes_host[i].leaf_column; + column_device_view* string_column = stripes_host[i].leaf_column; // NOTE: Requires the --expt-extended-lambda nvcc flag thrust::sort(rmm::exec_policy(stream), dict_data_ptr, dict_data_ptr + stripes_host[i].num_strings, - [string_column] __device__(const uint32_t &lhs, const uint32_t &rhs) { + [string_column] __device__(const uint32_t& lhs, const uint32_t& rhs) { return string_column->element(lhs) < string_column->element(rhs); }); diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index ea6d6b6ac85..287364c3191 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -23,7 +23,7 @@ namespace cudf { namespace io { namespace orc { -uint32_t ProtobufReader::read_field_size(const uint8_t *end) +uint32_t ProtobufReader::read_field_size(const uint8_t* end) { auto const size = get(); CUDF_EXPECTS(size <= static_cast(end - m_cur), "Protobuf parsing out of bounds"); @@ -37,13 +37,11 @@ void ProtobufReader::skip_struct_field(int t) case PB_TYPE_FIXED64: skip_bytes(8); break; case PB_TYPE_FIXEDLEN: skip_bytes(get()); break; case PB_TYPE_FIXED32: skip_bytes(4); break; - default: - // printf("invalid type (%d)\n", t); - break; + default: break; } } -void ProtobufReader::read(PostScript &s, size_t maxlen) +void ProtobufReader::read(PostScript& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.footerLength), make_field_reader(2, s.compression), @@ -54,7 +52,7 @@ void ProtobufReader::read(PostScript &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(FileFooter &s, size_t maxlen) +void ProtobufReader::read(FileFooter& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.headerLength), make_field_reader(2, s.contentLength), @@ -67,7 +65,7 @@ void ProtobufReader::read(FileFooter &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(StripeInformation &s, size_t maxlen) +void ProtobufReader::read(StripeInformation& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.offset), make_field_reader(2, s.indexLength), @@ -77,7 +75,7 @@ void ProtobufReader::read(StripeInformation &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(SchemaType &s, size_t maxlen) +void ProtobufReader::read(SchemaType& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.kind), make_packed_field_reader(2, s.subtypes), @@ -88,13 +86,13 @@ void ProtobufReader::read(SchemaType &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(UserMetadataItem &s, size_t maxlen) +void ProtobufReader::read(UserMetadataItem& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.name), make_field_reader(2, s.value)); function_builder(s, maxlen, op); } -void ProtobufReader::read(StripeFooter &s, size_t maxlen) +void ProtobufReader::read(StripeFooter& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.streams), make_field_reader(2, s.columns), @@ -102,7 +100,7 @@ void ProtobufReader::read(StripeFooter &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(Stream &s, size_t maxlen) +void ProtobufReader::read(Stream& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.kind), make_field_reader(2, s.column_id), @@ -110,59 +108,59 @@ void ProtobufReader::read(Stream &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(ColumnEncoding &s, size_t maxlen) +void ProtobufReader::read(ColumnEncoding& s, size_t maxlen) { auto op = 
std::make_tuple(make_field_reader(1, s.kind), make_field_reader(2, s.dictionarySize)); function_builder(s, maxlen, op); } -void ProtobufReader::read(integer_statistics &s, size_t maxlen) +void ProtobufReader::read(integer_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(double_statistics &s, size_t maxlen) +void ProtobufReader::read(double_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(string_statistics &s, size_t maxlen) +void ProtobufReader::read(string_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(bucket_statistics &s, size_t maxlen) +void ProtobufReader::read(bucket_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_packed_field_reader(1, s.count)); function_builder(s, maxlen, op); } -void ProtobufReader::read(decimal_statistics &s, size_t maxlen) +void ProtobufReader::read(decimal_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(date_statistics &s, size_t maxlen) +void ProtobufReader::read(date_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.minimum), make_field_reader(2, s.maximum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(binary_statistics &s, size_t maxlen) +void ProtobufReader::read(binary_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(timestamp_statistics &s, size_t maxlen) +void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), @@ -171,7 +169,7 @@ void ProtobufReader::read(timestamp_statistics &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(column_statistics &s, size_t maxlen) +void ProtobufReader::read(column_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.number_of_values), make_field_reader(2, s.int_stats), @@ -185,13 +183,13 @@ void ProtobufReader::read(column_statistics &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(StripeStatistics &s, size_t maxlen) +void ProtobufReader::read(StripeStatistics& s, size_t maxlen) { auto op = std::make_tuple(make_raw_field_reader(1, s.colStats)); function_builder(s, maxlen, op); } -void ProtobufReader::read(Metadata &s, size_t maxlen) +void ProtobufReader::read(Metadata& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.stripeStats)); function_builder(s, maxlen, op); @@ -245,7 +243,7 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk, m_buf->data()[lpos + 2] = (uint8_t)(sz); } -size_t ProtobufWriter::write(const PostScript &s) +size_t ProtobufWriter::write(const PostScript& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.footerLength); @@ -257,7 +255,7 @@ size_t ProtobufWriter::write(const PostScript &s) return w.value(); } -size_t ProtobufWriter::write(const FileFooter &s) 
+size_t ProtobufWriter::write(const FileFooter& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.headerLength); @@ -271,7 +269,7 @@ size_t ProtobufWriter::write(const FileFooter &s) return w.value(); } -size_t ProtobufWriter::write(const StripeInformation &s) +size_t ProtobufWriter::write(const StripeInformation& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.offset); @@ -282,7 +280,7 @@ size_t ProtobufWriter::write(const StripeInformation &s) return w.value(); } -size_t ProtobufWriter::write(const SchemaType &s) +size_t ProtobufWriter::write(const SchemaType& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -294,7 +292,7 @@ size_t ProtobufWriter::write(const SchemaType &s) return w.value(); } -size_t ProtobufWriter::write(const UserMetadataItem &s) +size_t ProtobufWriter::write(const UserMetadataItem& s) { ProtobufFieldWriter w(this); w.field_string(1, s.name); @@ -302,7 +300,7 @@ size_t ProtobufWriter::write(const UserMetadataItem &s) return w.value(); } -size_t ProtobufWriter::write(const StripeFooter &s) +size_t ProtobufWriter::write(const StripeFooter& s) { ProtobufFieldWriter w(this); w.field_repeated_struct(1, s.streams); @@ -311,7 +309,7 @@ size_t ProtobufWriter::write(const StripeFooter &s) return w.value(); } -size_t ProtobufWriter::write(const Stream &s) +size_t ProtobufWriter::write(const Stream& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -320,7 +318,7 @@ size_t ProtobufWriter::write(const Stream &s) return w.value(); } -size_t ProtobufWriter::write(const ColumnEncoding &s) +size_t ProtobufWriter::write(const ColumnEncoding& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -328,14 +326,14 @@ size_t ProtobufWriter::write(const ColumnEncoding &s) return w.value(); } -size_t ProtobufWriter::write(const StripeStatistics &s) +size_t ProtobufWriter::write(const StripeStatistics& s) { ProtobufFieldWriter w(this); w.field_repeated_struct_blob(1, s.colStats); return w.value(); } -size_t ProtobufWriter::write(const Metadata &s) +size_t ProtobufWriter::write(const Metadata& s) { ProtobufFieldWriter w(this); w.field_repeated_struct(1, s.stripeStats); @@ -376,7 +374,7 @@ OrcDecompressor::OrcDecompressor(CompressionKind kind, uint32_t blockSize) * * @returns pointer to uncompressed data, nullptr if error */ -const uint8_t *OrcDecompressor::Decompress(const uint8_t *srcBytes, size_t srcLen, size_t *dstLen) +const uint8_t* OrcDecompressor::Decompress(const uint8_t* srcBytes, size_t srcLen, size_t* dstLen) { // If uncompressed, just pass-through the input if (m_kind == NONE) { @@ -431,7 +429,7 @@ const uint8_t *OrcDecompressor::Decompress(const uint8_t *srcBytes, size_t srcLe return m_buf.data(); } -metadata::metadata(datasource *const src) : source(src) +metadata::metadata(datasource* const src) : source(src) { const auto len = source->size(); const auto max_ps_size = std::min(len, static_cast(256)); @@ -439,7 +437,7 @@ metadata::metadata(datasource *const src) : source(src) // Read uncompressed postscript section (max 255 bytes + 1 byte for length) auto buffer = source->host_read(len - max_ps_size, max_ps_size); const size_t ps_length = buffer->data()[max_ps_size - 1]; - const uint8_t *ps_data = &buffer->data()[max_ps_size - ps_length - 1]; + const uint8_t* ps_data = &buffer->data()[max_ps_size - ps_length - 1]; ProtobufReader(ps_data, ps_length).read(ps); CUDF_EXPECTS(ps.footerLength + ps_length < len, "Invalid footer length"); @@ -461,30 +459,23 @@ metadata::metadata(datasource *const src) : source(src) auto md_data = 
decompressor->Decompress(buffer->data(), ps.metadataLength, &md_length); orc::ProtobufReader(md_data, md_length).read(md); - // Initilize the column names + // Initialize the column names init_column_names(); } void metadata::init_column_names() const { auto const schema_idxs = get_schema_indexes(); - auto const &types = ff.types; + auto const& types = ff.types; for (int32_t col_id = 0; col_id < get_num_columns(); ++col_id) { std::string col_name; - uint32_t parent_idx = col_id; - uint32_t idx = col_id; - do { - idx = parent_idx; - parent_idx = (idx < types.size()) ? static_cast(schema_idxs[idx].parent) : ~0; - if (parent_idx >= types.size()) break; - - auto const field_idx = - (parent_idx < types.size()) ? static_cast(schema_idxs[idx].field) : ~0; + if (schema_idxs[col_id].parent >= 0 and schema_idxs[col_id].field >= 0) { + auto const parent_idx = static_cast(schema_idxs[col_id].parent); + auto const field_idx = static_cast(schema_idxs[col_id].field); if (field_idx < types[parent_idx].fieldNames.size()) { - col_name = - types[parent_idx].fieldNames[field_idx] + (col_name.empty() ? "" : ("." + col_name)); + col_name = types[parent_idx].fieldNames[field_idx]; } - } while (parent_idx != idx); + } // If we have no name (root column), generate a name column_names.push_back(col_name.empty() ? "col" + std::to_string(col_id) : col_name); } @@ -496,7 +487,7 @@ std::vector metadata::get_schema_indexes() const auto const schema_size = static_cast(result.size()); for (uint32_t i = 0; i < schema_size; i++) { - auto const &subtypes = ff.types[i].subtypes; + auto const& subtypes = ff.types[i].subtypes; auto const num_children = static_cast(subtypes.size()); if (result[i].parent == -1) { // Not initialized result[i].parent = i; // set root node as its own parent diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index e6fec8afb0f..474f404be0f 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -135,32 +135,32 @@ struct Metadata { */ class ProtobufReader { public: - ProtobufReader(const uint8_t *base, size_t len) : m_base(base), m_cur(base), m_end(base + len) {} + ProtobufReader(const uint8_t* base, size_t len) : m_base(base), m_cur(base), m_end(base + len) {} template - void read(T &s) + void read(T& s) { read(s, m_end - m_cur); } - void read(PostScript &, size_t maxlen); - void read(FileFooter &, size_t maxlen); - void read(StripeInformation &, size_t maxlen); - void read(SchemaType &, size_t maxlen); - void read(UserMetadataItem &, size_t maxlen); - void read(StripeFooter &, size_t maxlen); - void read(Stream &, size_t maxlen); - void read(ColumnEncoding &, size_t maxlen); - void read(integer_statistics &, size_t maxlen); - void read(double_statistics &, size_t maxlen); - void read(string_statistics &, size_t maxlen); - void read(bucket_statistics &, size_t maxlen); - void read(decimal_statistics &, size_t maxlen); - void read(date_statistics &, size_t maxlen); - void read(binary_statistics &, size_t maxlen); - void read(timestamp_statistics &, size_t maxlen); - void read(column_statistics &, size_t maxlen); - void read(StripeStatistics &, size_t maxlen); - void read(Metadata &, size_t maxlen); + void read(PostScript&, size_t maxlen); + void read(FileFooter&, size_t maxlen); + void read(StripeInformation&, size_t maxlen); + void read(SchemaType&, size_t maxlen); + void read(UserMetadataItem&, size_t maxlen); + void read(StripeFooter&, size_t maxlen); + void read(Stream&, size_t maxlen); + void read(ColumnEncoding&, size_t maxlen); + void read(integer_statistics&, size_t maxlen); + 
void read(double_statistics&, size_t maxlen); + void read(string_statistics&, size_t maxlen); + void read(bucket_statistics&, size_t maxlen); + void read(decimal_statistics&, size_t maxlen); + void read(date_statistics&, size_t maxlen); + void read(binary_statistics&, size_t maxlen); + void read(timestamp_statistics&, size_t maxlen); + void read(column_statistics&, size_t maxlen); + void read(StripeStatistics&, size_t maxlen); + void read(Metadata&, size_t maxlen); private: template @@ -178,11 +178,11 @@ class ProtobufReader { void skip_struct_field(int t); template - void function_builder(T &s, size_t maxlen, std::tuple &op); + void function_builder(T& s, size_t maxlen, std::tuple& op); template ::value and - !std::is_enum::value> * = nullptr> + !std::is_enum::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_FIXEDLEN; @@ -190,21 +190,21 @@ class ProtobufReader { template ::value or - std::is_enum::value> * = nullptr> + std::is_enum::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_VARINT; } template ::value> * = nullptr> + typename std::enable_if_t::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_FIXED32; } template ::value> * = nullptr> + typename std::enable_if_t::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_FIXED64; @@ -212,7 +212,7 @@ class ProtobufReader { template ::value or - std::is_same::value> * = nullptr> + std::is_same::value>* = nullptr> int static constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); @@ -220,8 +220,8 @@ class ProtobufReader { // containters change the field number encoding template >::value> - * = nullptr> + typename std::enable_if_t< + std::is_same>::value>* = nullptr> int static constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); @@ -229,49 +229,49 @@ class ProtobufReader { // optional fields don't change the field number encoding template >::value> - * = nullptr> + typename std::enable_if_t< + std::is_same>::value>* = nullptr> int static constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); } - uint32_t read_field_size(const uint8_t *end); + uint32_t read_field_size(const uint8_t* end); - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { value = get(); } - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { value = static_cast(get()); } - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); - value.assign(reinterpret_cast(m_cur), size); + value.assign(reinterpret_cast(m_cur), size); m_cur += size; } template >::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + typename std::enable_if_t>::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); - value.emplace_back(reinterpret_cast(m_cur), size); + value.emplace_back(reinterpret_cast(m_cur), 
size); m_cur += size; } - template >::value and - !std::is_same::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template < + typename T, + typename std::enable_if_t>::value and + !std::is_same::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); value.emplace_back(); @@ -279,9 +279,9 @@ class ProtobufReader { } template >::value> - * = nullptr> - void read_field(T &value, const uint8_t *end) + typename std::enable_if_t< + std::is_same>::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { typename T::value_type contained_value; read_field(contained_value, end); @@ -289,29 +289,30 @@ class ProtobufReader { } template - auto read_field(T &value, const uint8_t *end) -> decltype(read(value, 0)) + auto read_field(T& value, const uint8_t* end) -> decltype(read(value, 0)) { auto const size = read_field_size(end); read(value, size); } - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { memcpy(&value, m_cur, sizeof(T)); m_cur += sizeof(T); } template - void read_packed_field(T &value, const uint8_t *end) + void read_packed_field(T& value, const uint8_t* end) { auto const len = get(); auto const field_end = std::min(m_cur + len, end); - while (m_cur < field_end) value.push_back(get()); + while (m_cur < field_end) + value.push_back(get()); } template - void read_raw_field(T &value, const uint8_t *end) + void read_raw_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); value.emplace_back(m_cur, m_cur + size); @@ -321,14 +322,14 @@ class ProtobufReader { template struct field_reader { int const encoded_field_number; - T &output_value; + T& output_value; - field_reader(int field_number, T &field_value) + field_reader(int field_number, T& field_value) : encoded_field_number(encode_field_number(field_number)), output_value(field_value) { } - inline void operator()(ProtobufReader *pbr, const uint8_t *end) + inline void operator()(ProtobufReader* pbr, const uint8_t* end) { pbr->read_field(output_value, end); } @@ -337,14 +338,14 @@ class ProtobufReader { template struct packed_field_reader { int const encoded_field_number; - T &output_value; + T& output_value; - packed_field_reader(int field_number, T &field_value) + packed_field_reader(int field_number, T& field_value) : encoded_field_number(encode_field_number(field_number)), output_value(field_value) { } - inline void operator()(ProtobufReader *pbr, const uint8_t *end) + inline void operator()(ProtobufReader* pbr, const uint8_t* end) { pbr->read_packed_field(output_value, end); } @@ -353,22 +354,22 @@ class ProtobufReader { template struct raw_field_reader { int const encoded_field_number; - T &output_value; + T& output_value; - raw_field_reader(int field_number, T &field_value) + raw_field_reader(int field_number, T& field_value) : encoded_field_number(encode_field_number(field_number)), output_value(field_value) { } - inline void operator()(ProtobufReader *pbr, const uint8_t *end) + inline void operator()(ProtobufReader* pbr, const uint8_t* end) { pbr->read_raw_field(output_value, end); } }; - const uint8_t *const m_base; - const uint8_t *m_cur; - const uint8_t *const m_end; + const uint8_t* const m_base; + const uint8_t* m_cur; + const uint8_t* const m_end; public: /** @@ -381,7 +382,7 @@ class ProtobufReader { * @return the field reader object of the right type */ template - static auto make_field_reader(int field_number, 
T &field_value) + static auto make_field_reader(int field_number, T& field_value) { return field_reader(field_number, field_value); } @@ -395,7 +396,7 @@ class ProtobufReader { * @return the packed field reader object of the right type */ template - static auto make_packed_field_reader(int field_number, T &field_value) + static auto make_packed_field_reader(int field_number, T& field_value) { return packed_field_reader(field_number, field_value); } @@ -410,7 +411,7 @@ class ProtobufReader { * @return the raw field reader object of the right type */ template - static auto make_raw_field_reader(int field_number, T &field_value) + static auto make_raw_field_reader(int field_number, T& field_value) { return raw_field_reader(field_number, field_value); } @@ -469,7 +470,7 @@ inline int64_t ProtobufReader::get() class ProtobufWriter { public: ProtobufWriter() { m_buf = nullptr; } - ProtobufWriter(std::vector *output) { m_buf = output; } + ProtobufWriter(std::vector* output) { m_buf = output; } void putb(uint8_t v) { m_buf->push_back(v); } uint32_t put_uint(uint64_t v) { @@ -496,19 +497,19 @@ class ProtobufWriter { TypeKind kind); public: - size_t write(const PostScript &); - size_t write(const FileFooter &); - size_t write(const StripeInformation &); - size_t write(const SchemaType &); - size_t write(const UserMetadataItem &); - size_t write(const StripeFooter &); - size_t write(const Stream &); - size_t write(const ColumnEncoding &); - size_t write(const StripeStatistics &); - size_t write(const Metadata &); + size_t write(const PostScript&); + size_t write(const FileFooter&); + size_t write(const StripeInformation&); + size_t write(const SchemaType&); + size_t write(const UserMetadataItem&); + size_t write(const StripeFooter&); + size_t write(const Stream&); + size_t write(const ColumnEncoding&); + size_t write(const StripeStatistics&); + size_t write(const Metadata&); protected: - std::vector *m_buf; + std::vector* m_buf; struct ProtobufFieldWriter; }; @@ -519,7 +520,7 @@ class ProtobufWriter { class OrcDecompressor { public: OrcDecompressor(CompressionKind kind, uint32_t blockSize); - const uint8_t *Decompress(const uint8_t *srcBytes, size_t srcLen, size_t *dstLen); + const uint8_t* Decompress(const uint8_t* srcBytes, size_t srcLen, size_t* dstLen); uint32_t GetLog2MaxCompressionRatio() const { return m_log2MaxRatio; } uint32_t GetMaxUncompressedBlockSize(uint32_t block_len) const { @@ -537,12 +538,38 @@ class OrcDecompressor { std::vector m_buf; }; +/** + * @brief Stores orc id for each column and its adjacent number of children + * in case of struct or number of children in case of list column. + * If list column has struct column, then all child columns of that struct are treated as child + * column of list. + * + * @code{.pseudo} + * Consider following data where a struct has two members and a list column + * {"struct": [{"a": 1, "b": 2}, {"a":3, "b":5}], "list":[[1, 2], [2, 3]]} + * + * `orc_column_meta` for struct column would be + * id = 0 + * num_children = 2 + * + * `orc_column_meta` for list column would be + * id = 3 + * num_children = 1 + * @endcode + * + */ +struct orc_column_meta { + // orc_column_meta(uint32_t _id, uint32_t _num_children) : id(_id), num_children(_num_children){}; + uint32_t id; // orc id for the column + uint32_t num_children; // number of children at the same level of nesting in case of struct +}; + /** * @brief A helper class for ORC file metadata. Provides some additional * convenience methods for initializing and accessing metadata. 
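The `orc_column_meta` documentation above pairs each ORC column id with its adjacent child count. A tiny sketch populating it for the struct-plus-list example from that doc block:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct orc_column_meta {
  uint32_t id;            // orc id for the column
  uint32_t num_children;  // number of children at the same level of nesting
};

int main()
{
  // {"struct": [{"a":1, "b":2}, ...], "list": [[1, 2], ...]} per the @code example
  std::vector<orc_column_meta> metas{
    {0, 2},  // struct column: two children, a and b
    {3, 1},  // list column: one child
  };
  for (auto const& m : metas) {
    std::cout << "id=" << m.id << " num_children=" << m.num_children << '\n';
  }
}
```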
 /**
  * @brief A helper class for ORC file metadata. Provides some additional
  * convenience methods for initializing and accessing metadata.
  */
 class metadata {
-  using OrcStripeInfo = std::pair<const StripeInformation *, const StripeFooter *>;
+  using OrcStripeInfo = std::pair<const StripeInformation*, const StripeFooter*>;

  public:
   struct stripe_source_mapping {
@@ -551,12 +578,12 @@ class metadata {
   };

  public:
-  explicit metadata(datasource *const src);
+  explicit metadata(datasource* const src);

   size_t get_total_rows() const { return ff.numberOfRows; }
   int get_num_stripes() const { return ff.stripes.size(); }
   int get_num_columns() const { return ff.types.size(); }
-  std::string const &get_column_name(int32_t column_id) const
+  std::string const& get_column_name(int32_t column_id) const
   {
     if (column_names.empty() && get_num_columns() != 0) { init_column_names(); }
     return column_names[column_id];
@@ -569,7 +596,7 @@ class metadata {
   Metadata md;
   std::vector<StripeFooter> stripefooters;
   std::unique_ptr<OrcDecompressor> decompressor;
-  datasource *const source;
+  datasource* const source;

  private:
   struct schema_indexes {
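For orientation before the next file: the `make_*_field_reader` factories above are meant to be aggregated into a tuple that `function_builder` walks for every encoded field. A sketch of the intended call pattern, consistent with how the `PostScript` overload is assembled elsewhere in the ORC code (treat the exact member list and access as illustrative):

```cpp
// Sketch: how a ProtobufReader::read overload binds ORC field numbers
// to output members and dispatches over them.
void ProtobufReader::read(PostScript& s, size_t maxlen)
{
  auto op = std::make_tuple(make_field_reader(1, s.footerLength),
                            make_field_reader(2, s.compression),
                            make_field_reader(3, s.compressionBlockSize),
                            make_packed_field_reader(4, s.version),
                            make_field_reader(5, s.metadataLength),
                            make_field_reader(8000, s.magic));
  function_builder(s, maxlen, op);
}
```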
diff --git a/cpp/src/io/orc/orc_field_reader.hpp b/cpp/src/io/orc/orc_field_reader.hpp
index 8e9bca44340..45d2cbe3bf2 100644
--- a/cpp/src/io/orc/orc_field_reader.hpp
+++ b/cpp/src/io/orc/orc_field_reader.hpp
@@ -41,10 +41,10 @@ namespace orc {
 template <int index>
 struct FunctionSwitchImpl {
   template <typename... Operator>
-  static inline void run(ProtobufReader *pbr,
-                         const uint8_t *end,
-                         const int &encoded_field_number,
-                         std::tuple<Operator...> &ops)
+  static inline void run(ProtobufReader* pbr,
+                         const uint8_t* end,
+                         const int& encoded_field_number,
+                         std::tuple<Operator...>& ops)
   {
     if (encoded_field_number == std::get<index>(ops).encoded_field_number) {
       std::get<index>(ops)(pbr, end);
@@ -57,10 +57,10 @@ struct FunctionSwitchImpl {
 template <>
 struct FunctionSwitchImpl<0> {
   template <typename... Operator>
-  static inline void run(ProtobufReader *pbr,
-                         const uint8_t *end,
-                         const int &encoded_field_number,
-                         std::tuple<Operator...> &ops)
+  static inline void run(ProtobufReader* pbr,
+                         const uint8_t* end,
+                         const int& encoded_field_number,
+                         std::tuple<Operator...>& ops)
   {
     if (encoded_field_number == std::get<0>(ops).encoded_field_number) {
       std::get<0>(ops)(pbr, end);
@@ -78,10 +78,10 @@ struct FunctionSwitchImpl<0> {
  * pointed to by the functors.
  */
 template <typename T, typename... Operator>
-inline void ProtobufReader::function_builder(T &s, size_t maxlen, std::tuple<Operator...> &op)
+inline void ProtobufReader::function_builder(T& s, size_t maxlen, std::tuple<Operator...>& op)
 {
   constexpr int index = std::tuple_size<std::tuple<Operator...>>::value - 1;
-  auto *const end     = std::min(m_cur + maxlen, m_end);
+  auto* const end     = std::min(m_cur + maxlen, m_end);
   while (m_cur < end) {
     auto const field = get<uint32_t>();
     FunctionSwitchImpl<index>::run(this, end, field, op);
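The `FunctionSwitchImpl` recursion above is a standard compile-time linear search over the operator tuple; a self-contained analogue of the dispatch idea (simplified types, not the cudf code):

```cpp
#include <cstdio>
#include <tuple>

struct op {
  int tag;
  void (*fn)();
};

// Walk the tuple from the highest index down and invoke the matching handler,
// mirroring how FunctionSwitchImpl<index>::run recurses toward index 0.
template <int index>
struct switch_impl {
  template <typename... Ops>
  static void run(int tag, std::tuple<Ops...>& ops)
  {
    if (tag == std::get<index>(ops).tag) {
      std::get<index>(ops).fn();
    } else {
      switch_impl<index - 1>::run(tag, ops);
    }
  }
};

template <>
struct switch_impl<0> {
  template <typename... Ops>
  static void run(int tag, std::tuple<Ops...>& ops)
  {
    // An unmatched tag simply falls through; the real reader skips the field.
    if (tag == std::get<0>(ops).tag) { std::get<0>(ops).fn(); }
  }
};

int main()
{
  auto ops = std::make_tuple(op{8, [] { std::puts("field 8"); }},
                             op{24, [] { std::puts("field 24"); }});
  switch_impl<std::tuple_size<decltype(ops)>::value - 1>::run(24, ops);
}
```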
diff --git a/cpp/src/io/orc/orc_field_writer.hpp b/cpp/src/io/orc/orc_field_writer.hpp
index 13c7befa3a1..7882810b50d 100644
--- a/cpp/src/io/orc/orc_field_writer.hpp
+++ b/cpp/src/io/orc/orc_field_writer.hpp
@@ -31,15 +31,15 @@ namespace orc {
 struct ProtobufWriter::ProtobufFieldWriter {
   int struct_size;
-  ProtobufWriter *p;
+  ProtobufWriter* p;

-  ProtobufFieldWriter(ProtobufWriter *pbw) : struct_size(0), p(pbw) {}
+  ProtobufFieldWriter(ProtobufWriter* pbw) : struct_size(0), p(pbw) {}

   /**
    * @brief Function to write an unsigned integer to the internal buffer
    */
   template <typename T>
-  void field_uint(int field, const T &value)
+  void field_uint(int field, const T& value)
   {
     struct_size += p->put_uint(field * 8 + PB_TYPE_VARINT);
     struct_size += p->put_uint(static_cast<uint64_t>(value));
@@ -50,7 +50,7 @@ struct ProtobufWriter::ProtobufFieldWriter {
    * buffer
    */
   template <typename T>
-  void field_packed_uint(int field, const std::vector<T> &value)
+  void field_packed_uint(int field, const std::vector<T>& value)
   {
     struct_size += p->put_uint(field * 8 + PB_TYPE_FIXEDLEN);
     auto lpos = p->m_buf->size();
@@ -68,31 +68,33 @@ struct ProtobufWriter::ProtobufFieldWriter {
   /**
    * @brief Function to write a string to the internal buffer
    */
-  void field_string(int field, const std::string &value)
+  void field_string(int field, const std::string& value)
   {
     size_t len = value.length();
     struct_size += p->put_uint(field * 8 + PB_TYPE_FIXEDLEN);
     struct_size += p->put_uint(len) + len;
-    for (size_t i = 0; i < len; i++) p->putb(value[i]);
+    for (size_t i = 0; i < len; i++)
+      p->putb(value[i]);
   }

   /**
    * @brief Function to write a blob to the internal buffer
    */
   template <typename T>
-  void field_blob(int field, const std::vector<T> &value)
+  void field_blob(int field, const std::vector<T>& value)
   {
     size_t len = value.size();
     struct_size += p->put_uint(field * 8 + PB_TYPE_FIXEDLEN);
     struct_size += p->put_uint(len) + len;
-    for (size_t i = 0; i < len; i++) p->putb(value[i]);
+    for (size_t i = 0; i < len; i++)
+      p->putb(value[i]);
   }

   /**
    * @brief Function to write a struct to the internal buffer
    */
   template <typename T>
-  void field_struct(int field, const T &value)
+  void field_struct(int field, const T& value)
   {
     struct_size += p->put_uint((field)*8 + PB_TYPE_FIXEDLEN);
     auto lpos = p->m_buf->size();
@@ -107,18 +109,20 @@ struct ProtobufWriter::ProtobufFieldWriter {
   /**
    * @brief Function to write a vector of strings to the internal buffer
    */
-  void field_repeated_string(int field, const std::vector<std::string> &value)
+  void field_repeated_string(int field, const std::vector<std::string>& value)
   {
-    for (const auto &elem : value) field_string(field, elem);
+    for (const auto& elem : value)
+      field_string(field, elem);
   }

   /**
    * @brief Function to write a vector of structs to the internal buffer
    */
   template <typename T>
-  void field_repeated_struct(int field, const std::vector<T> &value)
+  void field_repeated_struct(int field, const std::vector<T>& value)
   {
-    for (const auto &elem : value) field_struct(field, elem);
+    for (const auto& elem : value)
+      field_struct(field, elem);
   }

   /**
@@ -126,9 +130,10 @@ struct ProtobufWriter::ProtobufFieldWriter {
    * buffer
    */
   template <typename T>
-  void field_repeated_struct_blob(int field, const std::vector<T> &value)
+  void field_repeated_struct_blob(int field, const std::vector<T>& value)
   {
-    for (const auto &elem : value) field_blob(field, elem);
+    for (const auto& elem : value)
+      field_blob(field, elem);
   }

   /**
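For context, `field_uint` and `field_string` above emit standard protobuf framing: a key varint of `field * 8 + wire_type`, then the payload. A standalone sketch of that framing (hypothetical helper names; `PB_TYPE_VARINT`/`PB_TYPE_FIXEDLEN` correspond to wire types 0 and 2):

```cpp
#include <cstdint>
#include <string>
#include <vector>

// Append v as a base-128 varint; returns the number of bytes written,
// which is what ProtobufFieldWriter accumulates into struct_size.
static size_t put_uint(std::vector<uint8_t>& buf, uint64_t v)
{
  size_t n = 0;
  for (; v > 0x7f; v >>= 7, ++n)
    buf.push_back(static_cast<uint8_t>(v | 0x80));
  buf.push_back(static_cast<uint8_t>(v));
  return n + 1;
}

// Length-delimited field: key varint, payload length, then the raw bytes.
static void field_string(std::vector<uint8_t>& buf, int field, const std::string& value)
{
  put_uint(buf, field * 8 + 2);  // key: (field number << 3) | wire type 2
  put_uint(buf, value.size());
  buf.insert(buf.end(), value.begin(), value.end());
}
```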
diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h
index 66734df86c0..fa91dd13755 100644
--- a/cpp/src/io/orc/orc_gpu.h
+++ b/cpp/src/io/orc/orc_gpu.h
@@ -37,7 +37,7 @@ using cudf::detail::device_2dspan;
 struct CompressedStreamInfo {
   CompressedStreamInfo() = default;
-  explicit constexpr CompressedStreamInfo(const uint8_t *compressed_data_, size_t compressed_size_)
+  explicit constexpr CompressedStreamInfo(const uint8_t* compressed_data_, size_t compressed_size_)
     : compressed_data(compressed_data_),
       uncompressed_data(nullptr),
       compressed_data_size(compressed_size_),
@@ -49,13 +49,13 @@ struct CompressedStreamInfo {
       max_uncompressed_size(0)
   {
   }
-  const uint8_t *compressed_data;  // [in] base ptr to compressed stream data
-  uint8_t *uncompressed_data;  // [in] base ptr to uncompressed stream data or NULL if not known yet
+  const uint8_t* compressed_data;  // [in] base ptr to compressed stream data
+  uint8_t* uncompressed_data;  // [in] base ptr to uncompressed stream data or NULL if not known yet
   size_t compressed_data_size;      // [in] compressed data size for this stream
-  gpu_inflate_input_s *decctl;      // [in] base ptr to decompression structure to be filled
-  gpu_inflate_status_s *decstatus;  // [in] results of decompression
-  gpu_inflate_input_s
-    *copyctl;  // [in] base ptr to copy structure to be filled for uncompressed blocks
+  gpu_inflate_input_s* decctl;      // [in] base ptr to decompression structure to be filled
+  gpu_inflate_status_s* decstatus;  // [in] results of decompression
+  gpu_inflate_input_s*
+    copyctl;  // [in] base ptr to copy structure to be filled for uncompressed blocks
   uint32_t num_compressed_blocks;    // [in,out] number of entries in decctl(in), number of compressed
                                      // blocks(out)
   uint32_t num_uncompressed_blocks;  // [in,out] number of entries in copyctl(in), number of
@@ -89,13 +89,16 @@ constexpr int orc_decimal2float64_scale = 0x80;
  * @brief Struct to describe per stripe's column information
  */
 struct ColumnDesc {
-  const uint8_t *streams[CI_NUM_STREAMS];  // ptr to data stream index
+  const uint8_t* streams[CI_NUM_STREAMS];  // ptr to data stream index
   uint32_t strm_id[CI_NUM_STREAMS];        // stream ids
   uint32_t strm_len[CI_NUM_STREAMS];       // stream length
-  uint32_t *valid_map_base;                // base pointer of valid bit map for this column
-  void *column_data_base;                  // base pointer of column data
+  uint32_t* valid_map_base;                // base pointer of valid bit map for this column
+  void* column_data_base;                  // base pointer of column data
   uint32_t start_row;                      // starting row of the stripe
-  uint32_t num_rows;                       // starting row of the stripe
+  uint32_t num_rows;                       // number of rows in stripe
+  uint32_t column_num_rows;                // number of rows in whole column
+  uint32_t num_child_rows;                 // store number of child rows if it's list column
+  uint32_t num_rowgroups;                  // number of rowgroups in the chunk
   uint32_t dictionary_start;               // start position in global dictionary
   uint32_t dict_len;                       // length of local dictionary
   uint32_t null_count;                     // number of null values in this stripe's column
@@ -115,6 +118,9 @@ struct RowGroup {
   uint32_t chunk_id;        // Column chunk this entry belongs to
   uint32_t strm_offset[2];  // Index offset for CI_DATA and CI_DATA2 streams
   uint16_t run_pos[2];      // Run position for CI_DATA and CI_DATA2
+  uint32_t num_rows;        // number of rows in rowgroup
+  uint32_t start_row;       // starting row of the rowgroup
+  uint32_t num_child_rows;  // number of rows of children in rowgroup in case of list type
 };

 /**
@@ -128,16 +134,16 @@ struct EncChunk {
   uint8_t dtype_len;   // data type length
   int32_t scale;       // scale for decimals or timestamps

-  uint32_t *dict_index;  // dictionary index from row index
+  uint32_t* dict_index;  // dictionary index from row index
   device_span<uint32_t> decimal_offsets;
-  column_device_view *leaf_column;
+  column_device_view* leaf_column;
 };

 /**
  * @brief Struct to describe the streams that correspond to a single `EncChunk`.
  */
 struct encoder_chunk_streams {
-  uint8_t *data_ptrs[CI_NUM_STREAMS];  // encoded output
+  uint8_t* data_ptrs[CI_NUM_STREAMS];  // encoded output
   int32_t ids[CI_NUM_STREAMS];         // stream id; -1 if stream is not present
   uint32_t lengths[CI_NUM_STREAMS];    // in: max length, out: actual length
 };
@@ -160,8 +166,8 @@ struct StripeStream {
  * @brief Struct to describe a dictionary chunk
 */
 struct DictionaryChunk {
-  uint32_t *dict_data;   // dictionary data (index of non-null rows)
-  uint32_t *dict_index;  // row indices of corresponding string (row from dictionary index)
+  uint32_t* dict_data;   // dictionary data (index of non-null rows)
+  uint32_t* dict_index;  // row indices of corresponding string (row from dictionary index)
   uint32_t start_row;    // start row of this chunk
   uint32_t num_rows;     // num rows in this chunk
   uint32_t num_strings;  // number of strings in this chunk
@@ -170,22 +176,22 @@ struct DictionaryChunk {
   uint32_t num_dict_strings;  // number of strings in dictionary
   uint32_t dict_char_count;   // size of dictionary string data for this chunk

-  column_device_view *leaf_column;  //!< Pointer to string column
+  column_device_view* leaf_column;  //!< Pointer to string column
 };

 /**
  * @brief Struct to describe a dictionary
  */
 struct StripeDictionary {
-  uint32_t *dict_data;   // row indices of corresponding string (row from dictionary index)
-  uint32_t *dict_index;  // dictionary index from row index
+  uint32_t* dict_data;   // row indices of corresponding string (row from dictionary index)
+  uint32_t* dict_index;  // dictionary index from row index
   uint32_t column_id;    // real column id
   uint32_t start_chunk;  // first chunk in stripe
   uint32_t num_chunks;   // number of chunks in the stripe
   uint32_t num_strings;  // number of unique strings in the dictionary
   uint32_t dict_char_count;  // total size of dictionary string data

-  column_device_view *leaf_column;  //!< Pointer to string column
+  column_device_view* leaf_column;  //!< Pointer to string column
 };

 /**
@@ -198,7 +204,7 @@ struct StripeDictionary {
  *compressed size)
  * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
  */
-void ParseCompressedStripeData(CompressedStreamInfo *strm_info,
+void ParseCompressedStripeData(CompressedStreamInfo* strm_info,
                                int32_t num_streams,
                                uint32_t compression_block_size,
                                uint32_t log2maxcr = 24,
@@ -211,7 +217,7 @@ void ParseCompressedStripeData(CompressedStreamInfo *strm_info,
 * @param[in] num_streams Number of compressed streams
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void PostDecompressionReassemble(CompressedStreamInfo *strm_info,
+void PostDecompressionReassemble(CompressedStreamInfo* strm_info,
                                  int32_t num_streams,
                                  rmm::cuda_stream_view stream = rmm::cuda_stream_default);
@@ -224,15 +230,19 @@ void PostDecompressionReassemble(CompressedStreamInfo *strm_info,
 * @param[in] num_columns Number of columns
 * @param[in] num_stripes Number of stripes
 * @param[in] num_rowgroups Number of row groups
+ * @param[in] rowidx_stride Row index stride
+ * @param[in] use_base_stride Whether to use base stride obtained from meta or use the computed
+ * value
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void ParseRowGroupIndex(RowGroup *row_groups,
-                        CompressedStreamInfo *strm_info,
-                        ColumnDesc *chunks,
+void ParseRowGroupIndex(RowGroup* row_groups,
+                        CompressedStreamInfo* strm_info,
+                        ColumnDesc* chunks,
                         uint32_t num_columns,
                         uint32_t num_stripes,
                         uint32_t num_rowgroups,
                         uint32_t rowidx_stride,
+                        bool use_base_stride,
                         rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -242,15 +252,13 @@ void ParseRowGroupIndex(RowGroup *row_groups,
 * @param[in] global_dictionary Global dictionary device array
 * @param[in] num_columns Number of columns
 * @param[in] num_stripes Number of stripes
- * @param[in] max_rows Maximum number of rows to load
 * @param[in] first_row Crop all rows below first_row
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
-                                      DictionaryEntry *global_dictionary,
+void DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
+                                      DictionaryEntry* global_dictionary,
                                       uint32_t num_columns,
                                       uint32_t num_stripes,
-                                      size_t max_rows = ~0,
                                       size_t first_row = 0,
                                       rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -261,25 +269,25 @@ void DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
 * @param[in] global_dictionary Global dictionary device array
 * @param[in] num_columns Number of columns
 * @param[in] num_stripes Number of stripes
- * @param[in] max_rows Maximum number of rows to load
 * @param[in] first_row Crop all rows below first_row
 * @param[in] tz_table Timezone translation table
 * @param[in] tz_len Length of timezone translation table
- * @param[in] row_groups Optional row index data
+ * @param[in] row_groups Optional row index data [rowgroup][column]
 * @param[in] num_rowgroups Number of row groups in row index data
 * @param[in] rowidx_stride Row index stride
+ * @param[in] level Current nesting level being processed
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void DecodeOrcColumnData(ColumnDesc const *chunks,
-                         DictionaryEntry *global_dictionary,
+void DecodeOrcColumnData(ColumnDesc* chunks,
+                         DictionaryEntry* global_dictionary,
+                         device_2dspan<RowGroup const> row_groups,
                          uint32_t num_columns,
                          uint32_t num_stripes,
-                         size_t max_rows = ~0,
                          size_t first_row = 0,
                          timezone_table_view tz_table = {},
-                         const RowGroup *row_groups = 0,
                          uint32_t num_rowgroups = 0,
                          uint32_t rowidx_stride = 0,
+                         size_t level = 0,
                         rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -303,7 +311,7 @@ void EncodeOrcColumnData(device_2dspan<EncChunk const> chunks,
 * @param[in,out] enc_streams chunk streams device array [column][rowgroup]
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void EncodeStripeDictionaries(StripeDictionary *stripes,
+void EncodeStripeDictionaries(StripeDictionary* stripes,
                               device_2dspan<EncChunk const> chunks,
                               uint32_t num_string_columns,
                               uint32_t num_stripes,
@@ -317,7 +325,7 @@ void EncodeStripeDictionaries(StripeDictionary *stripes,
 * @param[in,out] chunks encoder chunk device array [column][rowgroup]
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void set_chunk_columns(const table_device_view &view,
+void set_chunk_columns(const table_device_view& view,
                        device_2dspan<EncChunk> chunks,
                        rmm::cuda_stream_view stream);
@@ -345,14 +353,14 @@ void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
 * @param[out] comp_in Per-block compression input parameters
 * @param[out] comp_out Per-block compression status
 */
-void CompressOrcDataStreams(uint8_t *compressed_data,
+void CompressOrcDataStreams(uint8_t* compressed_data,
                             uint32_t num_compressed_blocks,
                             CompressionKind compression,
                             uint32_t comp_blk_size,
                             device_2dspan<StripeStream> strm_desc,
                             device_2dspan<encoder_chunk_streams> enc_streams,
-                            gpu_inflate_input_s *comp_in,
-                            gpu_inflate_status_s *comp_out,
+                            gpu_inflate_input_s* comp_in,
+                            gpu_inflate_status_s* comp_out,
                             rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -368,12 +376,12 @@ void CompressOrcDataStreams(uint8_t *compressed_data,
 * @param[in] num_rowgroups Number of row groups
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void InitDictionaryIndices(const table_device_view &view,
-                           DictionaryChunk *chunks,
-                           uint32_t *dict_data,
-                           uint32_t *dict_index,
+void InitDictionaryIndices(const table_device_view& view,
+                           DictionaryChunk* chunks,
+                           uint32_t* dict_data,
+                           uint32_t* dict_index,
                            size_t row_index_stride,
-                           size_type *str_col_ids,
+                           size_type* str_col_ids,
                            uint32_t num_columns,
                            uint32_t num_rowgroups,
                            rmm::cuda_stream_view stream);
@@ -389,9 +397,9 @@ void InitDictionaryIndices(const table_device_view &view,
 * @param[in] num_columns Number of columns
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void BuildStripeDictionaries(StripeDictionary *stripes_dev,
-                             StripeDictionary *stripes_host,
-                             DictionaryChunk const *chunks,
+void BuildStripeDictionaries(StripeDictionary* stripes_dev,
+                             StripeDictionary* stripes_host,
+                             DictionaryChunk const* chunks,
                              uint32_t num_stripes,
                              uint32_t num_rowgroups,
                              uint32_t num_columns,
@@ -407,8 +415,8 @@ void BuildStripeDictionaries(StripeDictionary *stripes_dev,
 * @param[in] row_index_stride Rowgroup size in rows
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void orc_init_statistics_groups(statistics_group *groups,
-                                const stats_column_desc *cols,
+void orc_init_statistics_groups(statistics_group* groups,
+                                const stats_column_desc* cols,
                                 uint32_t num_columns,
                                 uint32_t num_rowgroups,
                                 uint32_t row_index_stride,
@@ -422,8 +430,8 @@ void orc_init_statistics_groups(statistics_group *groups,
 * @param[in] statistics_count Number of statistics buffers to encode
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void orc_init_statistics_buffersize(statistics_merge_group *groups,
-                                    const statistics_chunk *chunks,
+void orc_init_statistics_buffersize(statistics_merge_group* groups,
+                                    const statistics_chunk* chunks,
                                     uint32_t statistics_count,
                                     rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -435,9 +443,9 @@ void orc_init_statistics_buffersize(statistics_merge_group *groups,
 * @param[in,out] chunks Statistics data
 * @param[in] statistics_count Number of statistics buffers
 */
-void orc_encode_statistics(uint8_t *blob_bfr,
-                           statistics_merge_group *groups,
-                           const statistics_chunk *chunks,
+void orc_encode_statistics(uint8_t* blob_bfr,
+                           statistics_merge_group* groups,
+                           const statistics_chunk* chunks,
                            uint32_t statistics_count,
                            rmm::cuda_stream_view stream = rmm::cuda_stream_default);
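One detail worth highlighting before the reader changes that follow: for list columns, ORC encodes per-row child counts, and the reader turns each decoded count buffer into list offsets with an in-place exclusive scan (`generate_offsets_for_list` does this on device). The host-side equivalent of that transform, as a sketch:

```cpp
#include <numeric>
#include <vector>

// Counts per row -> list offsets. For rows with 2 and 3 children this yields
// {0, 2, 5}: size + 1 entries, which is why the reader allocates one extra
// row for LIST offset buffers.
std::vector<int> counts_to_offsets(std::vector<int> counts)
{
  counts.push_back(0);  // room for the final offset
  std::exclusive_scan(counts.begin(), counts.end(), counts.begin(), 0);
  return counts;
}
```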
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 499cb3f0432..b2b4538994e 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -26,6 +26,7 @@
 #include <...>
 #include "orc.h"
+#include <...>
 #include <...>
 #include <...>
 #include <...>
@@ -34,6 +35,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <...>
 #include <...>
 #include <...>
@@ -50,7 +52,7 @@ namespace {
 /**
  * @brief Function that translates ORC data kind to cuDF type enum
  */
-constexpr type_id to_type_id(const orc::SchemaType &schema,
+constexpr type_id to_type_id(const orc::SchemaType& schema,
                              bool use_np_dtypes,
                              type_id timestamp_type_id,
                              bool decimals_as_float64)
@@ -76,6 +78,8 @@ constexpr type_id to_type_id(const orc::SchemaType &schema,
       // There isn't a (DAYS -> np.dtype) mapping
       return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS;
     case orc::DECIMAL: return (decimals_as_float64) ? type_id::FLOAT64 : type_id::DECIMAL64;
+    case orc::LIST: return type_id::LIST;
+    case orc::STRUCT: return type_id::STRUCT;
     default: break;
   }
@@ -123,6 +127,26 @@ constexpr std::pair<gpu::StreamIndexType, uint32_t> get_index_type_and_pos(
 }  // namespace

 namespace {
+/**
+ * @brief struct to store buffer data and size of list buffer
+ */
+struct list_buffer_data {
+  size_type* data;
+  size_type size;
+};
+
+// Generates offsets for list buffer from number of elements in a row.
+void generate_offsets_for_list(rmm::device_uvector<list_buffer_data> const& buff_data,
+                               rmm::cuda_stream_view stream)
+{
+  auto transformer = [] __device__(list_buffer_data list_data) {
+    thrust::exclusive_scan(
+      thrust::seq, list_data.data, list_data.data + list_data.size, list_data.data);
+  };
+  thrust::for_each(rmm::exec_policy(stream), buff_data.begin(), buff_data.end(), transformer);
+  stream.synchronize();
+}
+
 /**
  * @brief Struct that maps ORC streams to columns
  */
@@ -148,20 +172,19 @@ struct orc_stream_info {
 /**
  * @brief Function that populates column descriptors stream/chunk
  */
 size_t gather_stream_info(const size_t stripe_index,
-                          const orc::StripeInformation *stripeinfo,
-                          const orc::StripeFooter *stripefooter,
-                          const std::vector<int> &orc2gdf,
-                          const std::vector<int> &gdf2orc,
+                          const orc::StripeInformation* stripeinfo,
+                          const orc::StripeFooter* stripefooter,
+                          const std::vector<int>& orc2gdf,
+                          const std::vector<orc_column_meta>& gdf2orc,
                           const std::vector<orc::SchemaType> types,
                           bool use_index,
-                          size_t *num_dictionary_entries,
-                          hostdevice_vector<gpu::ColumnDesc> &chunks,
-                          std::vector<orc_stream_info> &stream_info)
+                          size_t* num_dictionary_entries,
+                          cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
+                          std::vector<orc_stream_info>& stream_info)
 {
-  const auto num_columns = gdf2orc.size();
-  uint64_t src_offset    = 0;
-  uint64_t dst_offset    = 0;
-  for (const auto &stream : stripefooter->streams) {
+  uint64_t src_offset = 0;
+  uint64_t dst_offset = 0;
+  for (const auto& stream : stripefooter->streams) {
     if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
       dst_offset += stream.length;
       continue;
     }
@@ -177,11 +200,11 @@ size_t gather_stream_info(const size_t stripe_index,
       const auto schema_type = types[column_id];
       if (schema_type.subtypes.size() != 0) {
         if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) {
-          for (const auto &idx : schema_type.subtypes) {
+          for (const auto& idx : schema_type.subtypes) {
             auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1;
             if (child_idx >= 0) {
               col = child_idx;
-              auto &chunk = chunks[stripe_index * num_columns + col];
+              auto& chunk = chunks[stripe_index][col];
               chunk.strm_id[gpu::CI_PRESENT]  = stream_info.size();
               chunk.strm_len[gpu::CI_PRESENT] = stream.length;
             }
@@ -192,7 +215,7 @@ size_t gather_stream_info(const size_t stripe_index,
     if (col != -1) {
       if (src_offset >= stripeinfo->indexLength || use_index) {
         // NOTE: skip_count field is temporarily used to track index ordering
-        auto &chunk = chunks[stripe_index * num_columns + col];
+        auto& chunk = chunks[stripe_index][col];
         const auto idx =
           get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[column_id]);
         if (idx.first < gpu::CI_NUM_STREAMS) {
@@ -220,8 +243,8 @@ size_t gather_stream_info(const size_t stripe_index,
 /**
  * @brief Determines if a column should be converted from decimal to float
  */
-bool should_convert_decimal_column_to_float(const std::vector<std::string> &columns_to_convert,
-                                            cudf::io::orc::metadata &metadata,
+bool should_convert_decimal_column_to_float(const std::vector<std::string>& columns_to_convert,
+                                            cudf::io::orc::metadata& metadata,
                                             int column_index)
 {
   return (std::find(columns_to_convert.begin(),
@@ -237,7 +260,7 @@ bool should_convert_decimal_column_to_float(const std::vector<std::string> &colu
 * to aggregate that metadata from all the files.
 */
 class aggregate_orc_metadata {
-  using OrcStripeInfo = std::pair<const StripeInformation *, const StripeFooter *>;
+  using OrcStripeInfo = std::pair<const StripeInformation*, const StripeFooter*>;

  public:
   mutable std::vector<cudf::io::orc::metadata> per_file_metadata;
@@ -248,11 +271,11 @@ class aggregate_orc_metadata {
   /**
    * @brief Create a metadata object from each element in the source vector
    */
-  auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const &sources)
+  auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const& sources)
   {
     std::vector<cudf::io::orc::metadata> metadatas;
     std::transform(
-      sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const &source) {
+      sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) {
         return cudf::io::orc::metadata(source.get());
       });
     return metadatas;
@@ -264,7 +287,7 @@ class aggregate_orc_metadata {
   size_type calc_num_rows() const
   {
     return std::accumulate(
-      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) {
+      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
         return sum + pfm.get_total_rows();
       });
   }
@@ -284,12 +307,12 @@ class aggregate_orc_metadata {
   size_type calc_num_stripes() const
   {
     return std::accumulate(
-      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) {
+      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
         return sum + pfm.get_num_stripes();
       });
   }

-  aggregate_orc_metadata(std::vector<std::unique_ptr<datasource>> const &sources)
+  aggregate_orc_metadata(std::vector<std::unique_ptr<datasource>> const& sources)
     : per_file_metadata(metadatas_from_sources(sources)),
       num_rows(calc_num_rows()),
       num_columns(calc_num_cols()),
@@ -297,7 +320,7 @@ class aggregate_orc_metadata {
   {
     // Verify that the input files have the same number of columns,
     // as well as matching types, compression, and names
-    for (auto const &pfm : per_file_metadata) {
+    for (auto const& pfm : per_file_metadata) {
       CUDF_EXPECTS(per_file_metadata[0].get_num_columns() == pfm.get_num_columns(),
                    "All sources must have the same number of columns");
       CUDF_EXPECTS(per_file_metadata[0].ps.compression == pfm.ps.compression,
@@ -318,7 +341,7 @@ class aggregate_orc_metadata {
     }
   }

-  auto const &get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; }
+  auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; }
   auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; }
@@ -330,7 +353,7 @@ class aggregate_orc_metadata {
   auto get_num_source_files() const { return per_file_metadata.size(); }

-  auto const &get_types() const { return per_file_metadata[0].ff.types; }
+  auto const& get_types() const { return per_file_metadata[0].ff.types; }

   int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; }
@@ -344,9 +367,9 @@ class aggregate_orc_metadata {
   }

   std::vector<metadata::stripe_source_mapping> select_stripes(
-    std::vector<std::vector<size_t>> const &user_specified_stripes,
-    size_type &row_start,
-    size_type &row_count)
+    std::vector<std::vector<size_t>> const& user_specified_stripes,
+    size_type& row_start,
+    size_type& row_count)
   {
     std::vector<metadata::stripe_source_mapping> selected_stripes_mapping;
@@ -365,7 +388,7 @@ class aggregate_orc_metadata {
         // Coalesce stripe info at the source file later since that makes downstream processing much
         // easier in impl::read
-        for (const size_t &stripe_idx : user_specified_stripes[src_file_idx]) {
+        for (const size_t& stripe_idx : user_specified_stripes[src_file_idx]) {
           CUDF_EXPECTS(stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size(),
                        "Invalid stripe index");
           stripe_infos.push_back(
@@ -384,7 +407,8 @@ class aggregate_orc_metadata {
       CUDF_EXPECTS(row_count >= 0, "Invalid row count");
       CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start");

-      size_type count = 0;
+      size_type count            = 0;
+      size_type stripe_skip_rows = 0;
       // Iterate all source files; each source file has correlating metadata
       for (size_t src_file_idx = 0;
            src_file_idx < per_file_metadata.size() && count < row_start + row_count;
@@ -399,16 +423,20 @@ class aggregate_orc_metadata {
           if (count > row_start || count == 0) {
             stripe_infos.push_back(
               std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
+          } else {
+            stripe_skip_rows = count;
           }
         }

         selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
       }
+      // Need to remove skipped rows from the stripes which are not selected.
+      row_start -= stripe_skip_rows;
     }

     // Read each stripe's stripefooter metadata
     if (not selected_stripes_mapping.empty()) {
-      for (auto &mapping : selected_stripes_mapping) {
+      for (auto& mapping : selected_stripes_mapping) {
         // Resize to all stripe_info for the source level
         per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size());
@@ -434,66 +462,121 @@ class aggregate_orc_metadata {
     return selected_stripes_mapping;
   }
+  /**
+   * @brief Adds a column as per the request and saves metadata about its children.
+   *        Struct children stay on the same level as the struct; only list column
+   *        children are pushed to the next level.
+   *
+   * @param selection A vector that saves the list of columns per level of nesting.
+   * @param types A vector of schema types of columns.
+   * @param level Current level of nesting.
+   * @param id Current column id that needs to be added.
+   * @param has_timestamp_column True if a timestamp column is present and false otherwise.
+   *
+   * @return The number of child columns at the same level in case of struct and at the next level
+   * in case of list
+   */
+  uint32_t add_column(std::vector<std::vector<orc_column_meta>>& selection,
+                      std::vector<orc::SchemaType> const& types,
+                      const size_t level,
+                      const uint32_t id,
+                      bool& has_timestamp_column,
+                      bool& has_list_column)
+  {
+    uint32_t num_lvl_child_columns = 0;
+    if (level == selection.size()) { selection.emplace_back(); }
+    selection[level].push_back({id, 0});
+    const int col_id = selection[level].size() - 1;
+    if (types[id].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
+
+    switch (types[id].kind) {
+      case orc::LIST: {
+        uint32_t lvl_cols = 0;
+        if (not types[id].subtypes.empty()) {
+          has_list_column = true;
+          // Since the list column needs to be processed before its child can be processed,
+          // the child column is added to the next level
+          lvl_cols =
+            add_column(selection, types, level + 1, id + 1, has_timestamp_column, has_list_column);
+        }
+        // The list child column may be a struct in which case lvl_cols will be > 1
+        selection[level][col_id].num_children = lvl_cols;
+      } break;
+
+      case orc::STRUCT:
+        for (const auto child_id : types[id].subtypes) {
+          num_lvl_child_columns +=
+            add_column(selection, types, level, child_id, has_timestamp_column, has_list_column);
+        }
+        selection[level][col_id].num_children = num_lvl_child_columns;
+        break;

+      default: break;
+    }
+
+    return num_lvl_child_columns + 1;
+  }
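A hand-worked illustration of what this recursion produces for the schema from the `orc_column_meta` example above (one plausible id numbering: root struct = 0, "struct" = 1 with children a = 2 and b = 3, "list" = 4 with element column 5):

```cpp
// Expected per-level selection (illustrative ids, not output of the code):
std::vector<std::vector<orc_column_meta>> expected_selection = {
  // level 0: struct children stay on the same level as the struct itself
  {{1, 2}, {2, 0}, {3, 0}, {4, 1}},
  // level 1: the list's element column is pushed one nesting level down
  {{5, 0}},
};
```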
   /**
    * @brief Filters and reduces down to a selection of columns
    *
    * @param use_names List of column names to select
    * @param has_timestamp_column True if timestamp column present and false otherwise
    *
-   * @return input column information, output column information, list of output column schema
-   * indices
+   * @return Vector of lists of ORC column meta-data
    */
-  std::vector<int> select_columns(std::vector<std::string> const &use_names,
-                                  bool &has_timestamp_column) const
+  std::vector<std::vector<orc_column_meta>> select_columns(
+    std::vector<std::string> const& use_names, bool& has_timestamp_column, bool& has_list_column)
   {
-    auto const &pfm = per_file_metadata[0];
+    auto const& pfm = per_file_metadata[0];
+    std::vector<std::vector<orc_column_meta>> selection;

-    std::vector<int> output_column_schema_idxs;
     if (not use_names.empty()) {
-      int index = 0;
-      for (auto const &use_name : use_names) {
+      uint32_t index = 0;
+      // Have to check only parent columns
+      auto const num_columns = pfm.ff.types[0].subtypes.size();
+
+      for (const auto& use_name : use_names) {
         bool name_found = false;
-        for (int i = 0; i < pfm.get_num_columns(); ++i, ++index) {
-          if (index >= pfm.get_num_columns()) { index = 0; }
-          if (pfm.get_column_name(index).compare(use_name) == 0) {
+        for (uint32_t i = 0; i < num_columns; ++i, ++index) {
+          if (index >= num_columns) { index = 0; }
+          auto col_id = pfm.ff.types[0].subtypes[index];
+          if (pfm.get_column_name(col_id) == use_name) {
             name_found = true;
-            output_column_schema_idxs.emplace_back(index);
-            if (pfm.ff.types[index].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
-            index++;
+            add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_list_column);
+            // Should start with next index
+            index = i + 1;
             break;
           }
         }
         CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
       }
     } else {
-      // For now, only select all leaf nodes
-      for (int i = 1; i < pfm.get_num_columns(); ++i) {
-        if (pfm.ff.types[i].subtypes.empty()) {
-          output_column_schema_idxs.emplace_back(i);
-          if (pfm.ff.types[i].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
-        }
+      for (auto const& col_id : pfm.ff.types[0].subtypes) {
+        add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_list_column);
       }
     }
-    return output_column_schema_idxs;
+    return selection;
   }
 };

 rmm::device_buffer reader::impl::decompress_stripe_data(
-  hostdevice_vector<gpu::ColumnDesc> &chunks,
-  const std::vector<rmm::device_buffer> &stripe_data,
-  const OrcDecompressor *decompressor,
-  std::vector<orc_stream_info> &stream_info,
+  cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
+  const std::vector<rmm::device_buffer>& stripe_data,
+  const OrcDecompressor* decompressor,
+  std::vector<orc_stream_info>& stream_info,
   size_t num_stripes,
-  device_span<gpu::RowGroup> row_groups,
+  cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
   size_t row_index_stride,
+  bool use_base_stride,
   rmm::cuda_stream_view stream)
 {
   // Parse the columns' compressed info
   hostdevice_vector<gpu::CompressedStreamInfo> compinfo(0, stream_info.size(), stream);
-  for (const auto &info : stream_info) {
+  for (const auto& info : stream_info) {
     compinfo.insert(gpu::CompressedStreamInfo(
-      static_cast<const uint8_t *>(stripe_data[info.stripe_idx].data()) + info.dst_pos,
+      static_cast<const uint8_t*>(stripe_data[info.stripe_idx].data()) + info.dst_pos,
       info.length));
   }
   compinfo.host_to_device(stream);
@@ -525,7 +608,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
   uint32_t start_pos        = 0;
   uint32_t start_pos_uncomp = (uint32_t)num_compressed_blocks;
   for (size_t i = 0; i < compinfo.size(); ++i) {
-    auto dst_base                 = static_cast<uint8_t *>(decomp_data.data());
+    auto dst_base                 = static_cast<uint8_t*>(decomp_data.data());
     compinfo[i].uncompressed_data = dst_base + decomp_offset;
     compinfo[i].decctl            = inflate_in.data() + start_pos;
     compinfo[i].decstatus         = inflate_out.data() + start_pos;
@@ -569,11 +652,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
   // decompression failed.
   compinfo.device_to_host(stream, true);

-  const size_t num_columns = chunks.size() / num_stripes;
+  const size_t num_columns = chunks.size().second;

   for (size_t i = 0; i < num_stripes; ++i) {
     for (size_t j = 0; j < num_columns; ++j) {
-      auto &chunk = chunks[i * num_columns + j];
+      auto& chunk = chunks[i][j];
       for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) {
         if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) {
           chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data;
@@ -583,38 +666,40 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
     }
   }

-  if (not row_groups.empty()) {
+  if (row_groups.size().first) {
     chunks.host_to_device(stream);
-    gpu::ParseRowGroupIndex(row_groups.data(),
+    row_groups.host_to_device(stream);
+    gpu::ParseRowGroupIndex(row_groups.base_device_ptr(),
                             compinfo.device_ptr(),
-                            chunks.device_ptr(),
+                            chunks.base_device_ptr(),
                             num_columns,
                             num_stripes,
-                            row_groups.size() / num_columns,
+                            row_groups.size().first,
                             row_index_stride,
+                            use_base_stride,
                             stream);
   }

   return decomp_data;
 }
-void reader::impl::decode_stream_data(hostdevice_vector<gpu::ColumnDesc> &chunks,
+void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
                                       size_t num_dicts,
                                       size_t skip_rows,
-                                      size_t num_rows,
                                       timezone_table_view tz_table,
-                                      device_span<gpu::RowGroup> row_groups,
+                                      cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
                                       size_t row_index_stride,
-                                      std::vector<column_buffer> &out_buffers,
+                                      std::vector<column_buffer>& out_buffers,
+                                      size_t level,
                                       rmm::cuda_stream_view stream)
 {
-  const auto num_columns = out_buffers.size();
-  const auto num_stripes = chunks.size() / out_buffers.size();
+  const auto num_stripes = chunks.size().first;
+  const auto num_columns = chunks.size().second;

   // Update chunks with pointers to column data
   for (size_t i = 0; i < num_stripes; ++i) {
     for (size_t j = 0; j < num_columns; ++j) {
-      auto &chunk            = chunks[i * num_columns + j];
+      auto& chunk            = chunks[i][j];
       chunk.column_data_base = out_buffers[j].data();
       chunk.valid_map_base   = out_buffers[j].null_mask();
     }
@@ -625,37 +710,203 @@ void reader::impl::decode_stream_data(hostdevice_vector<gpu::ColumnDesc> &chunks
   chunks.host_to_device(stream);

   gpu::DecodeNullsAndStringDictionaries(
-    chunks.device_ptr(), global_dict.data(), num_columns, num_stripes, num_rows, skip_rows, stream);
-  gpu::DecodeOrcColumnData(chunks.device_ptr(),
+    chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream);
+  gpu::DecodeOrcColumnData(chunks.base_device_ptr(),
                            global_dict.data(),
+                           row_groups,
                            num_columns,
                            num_stripes,
-                           num_rows,
                            skip_rows,
                            tz_table,
-                           row_groups.data(),
-                           row_groups.size() / num_columns,
+                           row_groups.size().first,
                            row_index_stride,
+                           level,
                            stream);
   chunks.device_to_host(stream, true);

   for (size_t i = 0; i < num_stripes; ++i) {
     for (size_t j = 0; j < num_columns; ++j) {
-      out_buffers[j].null_count() += chunks[i * num_columns + j].null_count;
+      out_buffers[j].null_count() += chunks[i][j].null_count;
     }
   }
 }

-reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources,
-                   orc_reader_options const &options,
-                   rmm::mr::device_memory_resource *mr)
+// Aggregate child column metadata per stripe and per column
+void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDesc> chunks,
+                                        cudf::detail::host_2dspan<gpu::RowGroup> row_groups,
+                                        std::vector<orc_column_meta> const& list_col,
+                                        const int32_t level)
+{
+  const auto num_of_stripes         = chunks.size().first;
+  const auto num_of_rowgroups       = row_groups.size().first;
+  const auto num_parent_cols        = _selected_columns[level].size();
+  const auto num_child_cols         = _selected_columns[level + 1].size();
+  const auto number_of_child_chunks = num_child_cols * num_of_stripes;
+  auto& num_child_rows              = _col_meta.num_child_rows;
+
+  // Reset the meta to store child column details.
+  num_child_rows.resize(_selected_columns[level + 1].size());
+  std::fill(num_child_rows.begin(), num_child_rows.end(), 0);
+  _col_meta.child_start_row.resize(number_of_child_chunks);
+  _col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks);
+  _col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols);
+
+  auto child_start_row = cudf::detail::host_2dspan<uint32_t>(
+    _col_meta.child_start_row.data(), num_of_stripes, num_child_cols);
+  auto num_child_rows_per_stripe = cudf::detail::host_2dspan<uint32_t>(
+    _col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols);
+  auto rwgrp_meta = cudf::detail::host_2dspan<reader_column_meta::row_group_meta>(
+    _col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols);
+
+  int index = 0;  // number of child columns processed
+
+  // For each parent column, update its child column meta for each stripe.
+  std::for_each(list_col.cbegin(), list_col.cend(), [&](const auto p_col) {
+    const auto parent_col_idx = _col_meta.orc_col_map[level][p_col.id];
+    auto start_row            = 0;
+    auto processed_row_groups = 0;
+
+    for (size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) {
+      // Aggregate num_rows and start_row from processed parent columns per row groups
+      if (num_of_rowgroups) {
+        auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups;
+        auto processed_child_rows  = 0;
+
+        for (size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups;
+             rowgroup_id++, processed_row_groups++) {
+          const auto child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows;
+          for (uint32_t id = 0; id < p_col.num_children; id++) {
+            const auto child_col_idx                                  = index + id;
+            rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows;
+            rwgrp_meta[processed_row_groups][child_col_idx].num_rows  = child_rows;
+          }
+          processed_child_rows += child_rows;
+        }
+      }
+
+      // Aggregate start row, number of rows per chunk and total number of rows in a column
+      const auto child_rows = chunks[stripe_id][parent_col_idx].num_child_rows;
+      for (uint32_t id = 0; id < p_col.num_children; id++) {
+        const auto child_col_idx = index + id;
+
+        num_child_rows[child_col_idx] += child_rows;
+        num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows;
+        // start row could be different for each column when there is nesting at each stripe level
+        child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row;
+      }
+      start_row += child_rows;
+    }
+    index += p_col.num_children;
+  });
+}
+
+std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_id,
+                                                          column_name_info& schema_info,
+                                                          rmm::cuda_stream_view stream)
+{
+  schema_info.name = _metadata->get_column_name(0, orc_col_id);
+  // If the column type is orc::DECIMAL see if the user
+  // desires it to be converted to float64 or not
+  auto const decimal_as_float64 = should_convert_decimal_column_to_float(
+    _decimal_cols_as_float, _metadata->per_file_metadata[0], orc_col_id);
+  auto const type = to_type_id(
+    _metadata->get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+  int32_t scale = 0;
+  std::vector<std::unique_ptr<column>> child_columns;
+  std::unique_ptr<column> out_col = nullptr;
+
+  switch (type) {
+    case type_id::LIST:
+      schema_info.children.emplace_back("offsets");
+      schema_info.children.emplace_back("");
+      out_col = make_lists_column(
+        0,
+        make_empty_column(data_type(type_id::INT32)),
+        create_empty_column(
+          _metadata->get_col_type(orc_col_id).subtypes[0], schema_info.children.back(), stream),
+        0,
+        rmm::device_buffer{0, stream},
+        stream);
+
+      break;
+
+    case type_id::STRUCT:
+      for (const auto col : _metadata->get_col_type(orc_col_id).subtypes) {
+        schema_info.children.emplace_back("");
+        child_columns.push_back(create_empty_column(col, schema_info.children.back(), stream));
+      }
+      out_col =
+        make_structs_column(0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream);
+      break;
+
+    case type_id::DECIMAL64:
+      scale   = -static_cast<int32_t>(_metadata->get_types()[orc_col_id].scale.value_or(0));
+      out_col = make_empty_column(data_type(type, scale));
+      break;
+
+    default: out_col = make_empty_column(data_type(type));
+  }
+
+  return out_col;
+}
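create_empty_column leans on the libcudf column factories; the same calls can be used directly to assemble, say, a zero-row LIST of INT32 column (a sketch mirroring the LIST branch above, with default memory-resource arguments elided as in the patch):

```cpp
#include <cudf/column/column_factories.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

// An empty LIST<INT32> column: empty INT32 offsets plus an empty INT32 child.
std::unique_ptr<cudf::column> empty_list_of_int32(rmm::cuda_stream_view stream)
{
  return cudf::make_lists_column(0,
                                 cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)),
                                 cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)),
                                 0,
                                 rmm::device_buffer{0, stream},
                                 stream);
}
```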
+// Adds child column buffers to parent column
+column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id,
+                                              std::vector<std::vector<column_buffer>>& col_buffers,
+                                              const size_t level)
+{
+  auto const col_id = _col_meta.orc_col_map[level][orc_col_id];
+  auto& col_buffer  = col_buffers[level][col_id];
+
+  col_buffer.name = _metadata->get_column_name(0, orc_col_id);
+  switch (col_buffer.type.id()) {
+    case type_id::LIST:
+      col_buffer.children.emplace_back(
+        assemble_buffer(_metadata->get_col_type(orc_col_id).subtypes[0], col_buffers, level + 1));
+      break;
+
+    case type_id::STRUCT:
+      for (auto const& col : _metadata->get_col_type(orc_col_id).subtypes) {
+        col_buffer.children.emplace_back(assemble_buffer(col, col_buffers, level));
+      }
+
+      break;
+
+    default: break;
+  }
+
+  return std::move(col_buffer);
+}
+
+// Creates columns along with schema information for each column
+void reader::impl::create_columns(std::vector<std::vector<column_buffer>>&& col_buffers,
+                                  std::vector<std::unique_ptr<column>>& out_columns,
+                                  std::vector<column_name_info>& schema_info,
+                                  rmm::cuda_stream_view stream)
+{
+  for (size_t i = 0; i < _selected_columns[0].size();) {
+    auto const& col_meta = _selected_columns[0][i];
+    schema_info.emplace_back("");
+
+    auto col_buffer = assemble_buffer(col_meta.id, col_buffers, 0);
+    out_columns.emplace_back(make_column(col_buffer, &schema_info.back(), stream, _mr));
+
+    // Need to skip child columns of struct which are at the same level and have been processed
+    i += (col_buffers[0][i].type.id() == type_id::STRUCT) ? col_meta.num_children + 1 : 1;
+  }
+}
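With schema_info threaded through column creation, nested ORC columns surface directly in the public API; a typical caller-side view (the path and column name are placeholders):

```cpp
#include <cudf/io/orc.hpp>

// Sketch: read a single list column from an ORC file with the public API.
cudf::io::table_with_metadata read_list_column()
{
  auto options = cudf::io::orc_reader_options::builder(cudf::io::source_info{"example.orc"})
                   .columns({"list"})
                   .build();
  auto result = cudf::io::read_orc(options);
  // result.tbl->view().column(0).type().id() == cudf::type_id::LIST
  return result;
}
```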
+reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                   orc_reader_options const& options,
+                   rmm::mr::device_memory_resource* mr)
   : _mr(mr), _sources(std::move(sources))
 {
   // Open and parse the source(s) dataset metadata
   _metadata = std::make_unique<aggregate_orc_metadata>(_sources);

   // Select only columns required by the options
-  _selected_columns = _metadata->select_columns(options.get_columns(), _has_timestamp_column);
+  _selected_columns =
+    _metadata->select_columns(options.get_columns(), _has_timestamp_column, _has_list_column);

   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {
@@ -674,272 +925,360 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources,

 table_with_metadata reader::impl::read(size_type skip_rows,
                                        size_type num_rows,
-                                       const std::vector<std::vector<size_type>> &stripes,
+                                       const std::vector<std::vector<size_type>>& stripes,
                                        rmm::cuda_stream_view stream)
 {
+  CUDF_EXPECTS(skip_rows == 0 or (not _has_list_column),
+               "skip_rows is not supported by list column");
+
+  std::vector<std::unique_ptr<column>> out_columns;
+  // buffer and stripe data are stored per nesting level
+  std::vector<std::vector<column_buffer>> out_buffers(_selected_columns.size());
+  std::vector<column_name_info> schema_info;
+  std::vector<std::vector<rmm::device_buffer>> lvl_stripe_data(_selected_columns.size());
   table_metadata out_metadata;

-  // There are no columns in table
+  // There are no columns in the table
   if (_selected_columns.size() == 0) return {std::make_unique<table>(), std::move(out_metadata)};

   // Select only stripes required (aka row groups)
   const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows);

-  // Association between each ORC column and its cudf::column
-  std::vector<int32_t> orc_col_map(_metadata->get_num_cols(), -1);
-
-  // Get a list of column data types
-  std::vector<data_type> column_types;
-  for (const auto &col : _selected_columns) {
-    // If the column type is orc::DECIMAL see if the user
-    // desires it to be converted to float64 or not
-    auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-      _decimal_cols_as_float, _metadata->per_file_metadata[0], col);
-
-    auto col_type = to_type_id(
-      _metadata->get_col_type(col), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
-    CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-    // Remove this once we support Decimal128 data type
-    CUDF_EXPECTS((col_type != type_id::DECIMAL64) or (_metadata->get_col_type(col).precision <= 18),
-                 "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
-    if (col_type == type_id::DECIMAL64) {
-      // sign of the scale is changed since cuDF follows c++ libraries like CNL
-      // which uses negative scaling, but liborc and other libraries
-      // follow positive scaling.
-      auto const scale = -static_cast<int32_t>(_metadata->get_col_type(col).scale.value_or(0));
-      column_types.emplace_back(col_type, scale);
-    } else {
-      column_types.emplace_back(col_type);
-    }
+  // Iterate through levels of nested columns: struct columns and their children stay
+  // on the same level, since the children have the same number of rows, while list
+  // column children are one level down from their parent.
+  for (size_t level = 0; level < _selected_columns.size(); level++) {
+    auto& selected_columns = _selected_columns[level];
+    // Association between each ORC column and its cudf::column
+    _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1);
+    std::vector<orc_column_meta> list_col;
+
+    // Get a list of column data types
+    std::vector<data_type> column_types;
+    for (auto& col : selected_columns) {
+      // If the column type is orc::DECIMAL see if the user
+      // desires it to be converted to float64 or not
+      auto const decimal_as_float64 = should_convert_decimal_column_to_float(
+        _decimal_cols_as_float, _metadata->per_file_metadata[0], col.id);
+      auto col_type = to_type_id(
+        _metadata->get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+      CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
+      // Remove this once we support Decimal128 data type
+      CUDF_EXPECTS(
+        (col_type != type_id::DECIMAL64) or (_metadata->get_col_type(col.id).precision <= 18),
+        "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
+      if (col_type == type_id::DECIMAL64) {
+        // sign of the scale is changed since cuDF follows c++ libraries like CNL
+        // which uses negative scaling, but liborc and other libraries
+        // follow positive scaling.
+        auto const scale = -static_cast<int32_t>(_metadata->get_col_type(col.id).scale.value_or(0));
+        column_types.emplace_back(col_type, scale);
+      } else {
+        column_types.emplace_back(col_type);
+      }

-    // Map each ORC column to its column
-    orc_col_map[col] = column_types.size() - 1;
-  }
+      // Map each ORC column to its column
+      _col_meta.orc_col_map[level][col.id] = column_types.size() - 1;
+      if (col_type == type_id::LIST) list_col.emplace_back(col);
+    }

-  // If no rows or stripes to read, return empty columns
-  if (num_rows <= 0 || selected_stripes.empty()) {
-    std::transform(column_types.cbegin(),
-                   column_types.cend(),
-                   std::back_inserter(out_columns),
-                   [](auto const &dtype) { return make_empty_column(dtype); });
-  } else {
-    // Get the total number of stripes across all input files.
-    size_t total_num_stripes =
-      std::accumulate(selected_stripes.begin(),
-                      selected_stripes.end(),
-                      0,
-                      [](size_t sum, auto &stripe_source_mapping) {
-                        return sum + stripe_source_mapping.stripe_info.size();
-                      });
-
-    const auto num_columns = _selected_columns.size();
-    const auto num_chunks  = total_num_stripes * num_columns;
-    hostdevice_vector<gpu::ColumnDesc> chunks(num_chunks, stream);
-    memset(chunks.host_ptr(), 0, chunks.memory_size());
-
-    const bool use_index =
-      (_use_index == true) &&
-      // Only use if we don't have much work with complete columns & stripes
-      // TODO: Consider nrows, gpu, and tune the threshold
-      (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) &&
-       _metadata->get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) &&
-      // Only use if first row is aligned to a stripe boundary
-      // TODO: Fix logic to handle unaligned rows
-      (skip_rows == 0);
-
-    // Logically view streams as columns
-    std::vector<orc_stream_info> stream_info;
-
-    // Tracker for eventually deallocating compressed and uncompressed data
-    std::vector<rmm::device_buffer> stripe_data;
-
-    size_t stripe_start_row   = 0;
-    size_t num_dict_entries   = 0;
-    size_t num_rowgroups      = 0;
-    size_t stripe_chunk_index = 0;
-
-    for (auto &stripe_source_mapping : selected_stripes) {
-      // Iterate through the source files selected stripes
-      for (size_t stripe_pos_index = 0; stripe_pos_index < stripe_source_mapping.stripe_info.size();
-           stripe_pos_index++) {
-        auto &stripe_pair        = stripe_source_mapping.stripe_info[stripe_pos_index];
-        const auto stripe_info   = stripe_pair.first;
-        const auto stripe_footer = stripe_pair.second;
-
-        auto stream_count          = stream_info.size();
-        const auto total_data_size = gather_stream_info(stripe_chunk_index,
-                                                        stripe_info,
-                                                        stripe_footer,
-                                                        orc_col_map,
-                                                        _selected_columns,
-                                                        _metadata->get_types(),
-                                                        use_index,
-                                                        &num_dict_entries,
-                                                        chunks,
-                                                        stream_info);
-
-        CUDF_EXPECTS(total_data_size > 0, "Expected streams data within stripe");
-
-        stripe_data.emplace_back(total_data_size, stream);
-        auto dst_base = static_cast<uint8_t *>(stripe_data.back().data());
-
-        // Coalesce consecutive streams into one read
-        while (stream_count < stream_info.size()) {
-          const auto d_dst  = dst_base + stream_info[stream_count].dst_pos;
-          const auto offset = stream_info[stream_count].offset;
-          auto len          = stream_info[stream_count].length;
-          stream_count++;
-
-          while (stream_count < stream_info.size() &&
-                 stream_info[stream_count].offset == offset + len) {
-            len += stream_info[stream_count].length;
+    // If no rows or stripes to read, return empty columns
+    if (num_rows <= 0 || selected_stripes.empty()) {
+      for (size_t i = 0; i < _selected_columns[0].size();) {
+        auto const& col_meta = _selected_columns[0][i];
+        auto const schema    = _metadata->get_schema(col_meta.id);
+        schema_info.emplace_back("");
+        out_columns.push_back(
+          std::move(create_empty_column(col_meta.id, schema_info.back(), stream)));
+        // Since struct children will be in the same level, have to skip them.
+        i += (schema.kind == orc::STRUCT) ? col_meta.num_children + 1 : 1;
+      }
+      break;
+    } else {
+      // Get the total number of stripes across all input files.
+      size_t total_num_stripes =
+        std::accumulate(selected_stripes.begin(),
+                        selected_stripes.end(),
+                        0,
+                        [](size_t sum, auto& stripe_source_mapping) {
+                          return sum + stripe_source_mapping.stripe_info.size();
+                        });
+      const auto num_columns = selected_columns.size();
+      cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> chunks(
+        total_num_stripes, num_columns, stream);
+      memset(chunks.base_host_ptr(), 0, chunks.memory_size());
+
+      const bool use_index =
+        (_use_index == true) &&
+        // Only use if we don't have much work with complete columns & stripes
+        // TODO: Consider nrows, gpu, and tune the threshold
+        (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) &&
+         _metadata->get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) &&
+        // Only use if first row is aligned to a stripe boundary
+        // TODO: Fix logic to handle unaligned rows
+        (skip_rows == 0);
+
+      // Logically view streams as columns
+      std::vector<orc_stream_info> stream_info;
+
+      // Tracker for eventually deallocating compressed and uncompressed data
+      auto& stripe_data = lvl_stripe_data[level];
+
+      size_t stripe_start_row = 0;
+      size_t num_dict_entries = 0;
+      size_t num_rowgroups    = 0;
+      int stripe_idx          = 0;
+
+      for (auto const& stripe_source_mapping : selected_stripes) {
+        // Iterate through the source files selected stripes
+        for (auto const& stripe : stripe_source_mapping.stripe_info) {
+          const auto stripe_info   = stripe.first;
+          const auto stripe_footer = stripe.second;
+
+          auto stream_count          = stream_info.size();
+          const auto total_data_size = gather_stream_info(stripe_idx,
+                                                          stripe_info,
+                                                          stripe_footer,
+                                                          _col_meta.orc_col_map[level],
+                                                          selected_columns,
+                                                          _metadata->get_types(),
+                                                          use_index,
+                                                          &num_dict_entries,
+                                                          chunks,
+                                                          stream_info);
+
+          CUDF_EXPECTS(total_data_size > 0, "Expected streams data within stripe");
+
+          stripe_data.emplace_back(total_data_size, stream);
+          auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());
+
+          // Coalesce consecutive streams into one read
+          while (stream_count < stream_info.size()) {
+            const auto d_dst  = dst_base + stream_info[stream_count].dst_pos;
+            const auto offset = stream_info[stream_count].offset;
+            auto len          = stream_info[stream_count].length;
+            stream_count++;
+
+            while (stream_count < stream_info.size() &&
+                   stream_info[stream_count].offset == offset + len) {
+              len += stream_info[stream_count].length;
+              stream_count++;
+            }
+            if (_metadata->per_file_metadata[stripe_source_mapping.source_idx]
+                  .source->is_device_read_preferred(len)) {
+              CUDF_EXPECTS(
+                _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->device_read(
+                  offset, len, d_dst, stream) == len,
+                "Unexpected discrepancy in bytes read.");
+            } else {
+              const auto buffer =
+                _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
+                  offset, len);
+              CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
+              CUDA_TRY(cudaMemcpyAsync(
+                d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
+              stream.synchronize();
+            }
+          }
-          if (_metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                .source->is_device_read_preferred(len)) {
-            CUDF_EXPECTS(
-              _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->device_read(
-                offset, len, d_dst, stream) == len,
-              "Unexpected discrepancy in bytes read.");
-          } else {
-            const auto buffer =
-              _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
-                offset, len);
-            CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
-            CUDA_TRY(
-              cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
-            stream.synchronize();
-          }
-        }
-
-        // Update chunks to reference streams pointers
-        for (size_t col_idx = 0; col_idx < num_columns; col_idx++) {
-          auto &chunk         = chunks[stripe_chunk_index * num_columns + col_idx];
-          chunk.start_row     = stripe_start_row;
-          chunk.num_rows      = stripe_info->numberOfRows;
-          chunk.encoding_kind = stripe_footer->columns[_selected_columns[col_idx]].kind;
-          chunk.type_kind     = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                                  .ff.types[_selected_columns[col_idx]]
-                                  .kind;
-          auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-            _decimal_cols_as_float, _metadata->per_file_metadata[0], _selected_columns[col_idx]);
-          chunk.decimal_scale = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                                  .ff.types[_selected_columns[col_idx]]
-                                  .scale.value_or(0) |
-                                (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0);
-          chunk.rowgroup_id = num_rowgroups;
-          chunk.dtype_len   = (column_types[col_idx].id() == type_id::STRING)
-                              ? sizeof(string_index_pair)
-                              : cudf::size_of(column_types[col_idx]);
-          if (chunk.type_kind == orc::TIMESTAMP) {
-            chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
+          const auto num_rows_per_stripe = stripe_info->numberOfRows;
+          const auto rowgroup_id         = num_rowgroups;
+          auto stripe_num_rowgroups      = 0;
+          if (use_index) {
+            stripe_num_rowgroups = (num_rows_per_stripe + _metadata->get_row_index_stride() - 1) /
+                                   _metadata->get_row_index_stride();
           }
-          for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-            chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
+          // Update chunks to reference streams pointers
+          for (size_t col_idx = 0; col_idx < num_columns; col_idx++) {
+            auto& chunk = chunks[stripe_idx][col_idx];
+            // start row, number of rows in each stripe and total number of rows
+            // may change in lower levels of nesting
+            chunk.start_row = (level == 0)
+                                ? stripe_start_row
+                                : _col_meta.child_start_row[stripe_idx * num_columns + col_idx];
+            chunk.num_rows =
+              (level == 0)
+                ? stripe_info->numberOfRows
+                : _col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx];
+            chunk.column_num_rows = (level == 0) ? num_rows : _col_meta.num_child_rows[col_idx];
+            chunk.encoding_kind   = stripe_footer->columns[selected_columns[col_idx].id].kind;
+            chunk.type_kind       = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
+                                      .ff.types[selected_columns[col_idx].id]
+                                      .kind;
+            auto const decimal_as_float64 =
+              should_convert_decimal_column_to_float(_decimal_cols_as_float,
+                                                     _metadata->per_file_metadata[0],
+                                                     selected_columns[col_idx].id);
+            chunk.decimal_scale = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
+                                    .ff.types[selected_columns[col_idx].id]
+                                    .scale.value_or(0) |
+                                  (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0);
+
+            chunk.rowgroup_id = rowgroup_id;
+            chunk.dtype_len   = (column_types[col_idx].id() == type_id::STRING)
+                                ? sizeof(string_index_pair)
+                                : ((column_types[col_idx].id() == type_id::LIST) or
+                                   (column_types[col_idx].id() == type_id::STRUCT))
sizeof(int32_t) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + if (chunk.type_kind == orc::TIMESTAMP) { + chunk.ts_clock_rate = to_clockrate(_timestamp_type.id()); + } + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + } } - } + stripe_start_row += num_rows_per_stripe; + num_rowgroups += stripe_num_rowgroups; - stripe_start_row += stripe_info->numberOfRows; - if (use_index) { - num_rowgroups += (stripe_info->numberOfRows + _metadata->get_row_index_stride() - 1) / - _metadata->get_row_index_stride(); + stripe_idx++; } - stripe_chunk_index++; } - } - // Process dataset chunk pages into output columns - if (stripe_data.size() != 0) { - // Setup row group descriptors if using indexes - rmm::device_uvector row_groups(num_rowgroups * num_columns, stream); - if (_metadata->per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = - decompress_stripe_data(chunks, - stripe_data, - _metadata->per_file_metadata[0].decompressor.get(), - stream_info, - total_num_stripes, - row_groups, - _metadata->get_row_index_stride(), - stream); - stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); - } else { - if (not row_groups.is_empty()) { - chunks.host_to_device(stream); - gpu::ParseRowGroupIndex(row_groups.data(), - nullptr, - chunks.device_ptr(), - num_columns, - total_num_stripes, - num_rowgroups, - _metadata->get_row_index_stride(), - stream); + // Process dataset chunk pages into output columns + if (stripe_data.size() != 0) { + auto row_groups = + cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, stream); + if (level > 0 and row_groups.size().first) { + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_columns); + auto& rw_grp_meta = _col_meta.rwgrp_meta; + + // Update start row and num rows per row group + std::transform(rw_grp_meta.begin(), + rw_grp_meta.end(), + row_groups_span.begin(), + rw_grp_meta.begin(), + [&](auto meta, auto& row_grp) { + row_grp.num_rows = meta.num_rows; + row_grp.start_row = meta.start_row; + return meta; + }); + } + // Setup row group descriptors if using indexes + if (_metadata->per_file_metadata[0].ps.compression != orc::NONE) { + auto decomp_data = + decompress_stripe_data(chunks, + stripe_data, + _metadata->per_file_metadata[0].decompressor.get(), + stream_info, + total_num_stripes, + row_groups, + _metadata->get_row_index_stride(), + level == 0, + stream); + stripe_data.clear(); + stripe_data.push_back(std::move(decomp_data)); + } else { + if (row_groups.size().first) { + chunks.host_to_device(stream); + row_groups.host_to_device(stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + nullptr, + chunks.base_device_ptr(), + num_columns, + total_num_stripes, + num_rowgroups, + _metadata->get_row_index_stride(), + level == 0, + stream); + } } - } - // Setup table for converting timestamp columns from local to UTC time - auto const tz_table = _has_timestamp_column - ? build_timezone_transition_table( - selected_stripes[0].stripe_info[0].second->writerTimezone, stream) - : timezone_table{}; - - std::vector out_buffers; - for (size_t i = 0; i < column_types.size(); ++i) { - bool is_nullable = false; - for (size_t j = 0; j < total_num_stripes; ++j) { - if (chunks[j * num_columns + i].strm_len[gpu::CI_PRESENT] != 0) { - is_nullable = true; - break; + // Setup table for converting timestamp columns from local to UTC time + auto const tz_table = + _has_timestamp_column + ? 
build_timezone_transition_table(
+            selected_stripes[0].stripe_info[0].second->writerTimezone, stream)
+        : timezone_table{};
+
+      for (size_t i = 0; i < column_types.size(); ++i) {
+        bool is_nullable = false;
+        for (size_t j = 0; j < total_num_stripes; ++j) {
+          if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) {
+            is_nullable = true;
+            break;
+          }
+        }
+        auto is_list_type = (column_types[i].id() == type_id::LIST);
+        auto n_rows       = (level == 0) ? num_rows : _col_meta.num_child_rows[i];
+        // For list column, offset column will be always size + 1
+        if (is_list_type) n_rows++;
+        out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr);
       }
-      out_buffers.emplace_back(column_types[i], num_rows, is_nullable, stream, _mr);
-    }
-    decode_stream_data(chunks,
-                       num_dict_entries,
-                       skip_rows,
-                       num_rows,
-                       tz_table.view(),
-                       row_groups,
-                       _metadata->get_row_index_stride(),
-                       out_buffers,
-                       stream);
-
-    for (size_t i = 0; i < column_types.size(); ++i) {
-      out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, _mr));
+      decode_stream_data(chunks,
+                         num_dict_entries,
+                         skip_rows,
+                         tz_table.view(),
+                         row_groups,
+                         _metadata->get_row_index_stride(),
+                         out_buffers[level],
+                         level,
+                         stream);
+
+      // Extract information to process list child columns
+      if (list_col.size()) {
+        row_groups.device_to_host(stream, true);
+        aggregate_child_meta(chunks, row_groups, list_col, level);
+      }
+
+      // ORC stores number of elements at each row, so we need to generate offsets from that
+      if (list_col.size()) {
+        std::vector<list_buffer_data> buff_data;
+        std::for_each(
+          out_buffers[level].begin(), out_buffers[level].end(), [&buff_data](auto& out_buffer) {
+            if (out_buffer.type.id() == type_id::LIST) {
+              auto data = static_cast<size_type*>(out_buffer.data());
+              buff_data.emplace_back(list_buffer_data{data, out_buffer.size});
+            }
+          });
+
+        auto const dev_buff_data = cudf::detail::make_device_uvector_async(buff_data, stream);
+        generate_offsets_for_list(dev_buff_data, stream);
+      }
+    }
     }
   }
 
-  // Return column names (must match order of returned columns)
-  out_metadata.column_names.resize(_selected_columns.size());
-  for (size_t i = 0; i < _selected_columns.size(); i++) {
-    out_metadata.column_names[i] = _metadata->get_column_name(0, _selected_columns[i]);
+  // If out_columns is empty, then create columns from buffer.
+  if (out_columns.empty()) {
+    create_columns(std::move(out_buffers), out_columns, schema_info, stream);
   }
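// For intuition: generate_offsets_for_list above turns ORC's per-row element
// counts into cudf offsets, i.e. an exclusive prefix sum with one extra
// trailing entry (which is why list buffers are allocated with n_rows + 1).
// A minimal host-side sketch of the same transform, with illustrative names
// that are not part of this patch:
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int32_t> lengths_to_offsets(std::vector<int32_t> const& lengths)
{
  // offsets[0] = 0; offsets[i + 1] = lengths[0] + ... + lengths[i]
  std::vector<int32_t> offsets(lengths.size() + 1, 0);
  std::partial_sum(lengths.begin(), lengths.end(), offsets.begin() + 1);
  return offsets;  // e.g. lengths {2, 0, 3} -> offsets {0, 2, 2, 5}
}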
-  for (const auto &meta : _metadata->per_file_metadata) {
-    for (const auto &kv : meta.ff.metadata) { out_metadata.user_data.insert({kv.name, kv.value}); }
+  // Return column names (must match order of returned columns)
+  out_metadata.column_names.reserve(schema_info.size());
+  std::transform(schema_info.cbegin(),
+                 schema_info.cend(),
+                 std::back_inserter(out_metadata.column_names),
+                 [](auto info) { return info.name; });
+
+  out_metadata.schema_info = std::move(schema_info);
+
+  for (const auto& meta : _metadata->per_file_metadata) {
+    for (const auto& kv : meta.ff.metadata) {
+      out_metadata.user_data.insert({kv.name, kv.value});
+    }
   }
 
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
 
 // Forward to implementation
-reader::reader(std::vector<std::string> const &filepaths,
-               orc_reader_options const &options,
+reader::reader(std::vector<std::string> const& filepaths,
+               orc_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   _impl = std::make_unique<impl>(datasource::create(filepaths), options, mr);
 }
 
 // Forward to implementation
-reader::reader(std::vector<std::unique_ptr<datasource>> &&sources,
-               orc_reader_options const &options,
+reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
+               orc_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   _impl = std::make_unique<impl>(std::move(sources), options, mr);
 }
@@ -948,7 +1287,7 @@ reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
 reader::~reader() = default;
 
 // Forward to implementation
-table_with_metadata reader::read(orc_reader_options const &options, rmm::cuda_stream_view stream)
+table_with_metadata reader::read(orc_reader_options const& options, rmm::cuda_stream_view stream)
 {
   return _impl->read(
     options.get_skip_rows(), options.get_num_rows(), options.get_stripes(), stream);
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 0307d84cd1b..1769fb6f193 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -48,6 +48,24 @@ struct stripe_source_mapping;
 }  // namespace
 class aggregate_orc_metadata;
 
+/**
+ * @brief Keeps track of orc mapping and child column details.
+ */
+struct reader_column_meta {
+  std::vector<std::vector<int32_t>>
+    orc_col_map;                         // Mapping between column id in orc to processing order.
+  std::vector<uint32_t> num_child_rows;  // number of rows in child columns
+  std::vector<uint32_t> child_start_row;  // start row of child columns [stripe][column]
+  std::vector<uint32_t>
+    num_child_rows_per_stripe;  // number of rows of child columns [stripe][column]
+  struct row_group_meta {
+    uint32_t num_rows;   // number of rows in a column in a row group
+    uint32_t start_row;  // start row in a column in a row group
+  };
+  // num_rowgroups * num_columns
+  std::vector<row_group_meta> rwgrp_meta;  // rowgroup metadata [rowgroup][column]
+};
+
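// A toy illustration of what reader_column_meta::orc_col_map encodes: one
// vector per nesting level, indexed by ORC column id, holding that column's
// processing slot at the level. The -1 sentinel and the example schema below
// are assumptions for illustration, not taken from this patch.
#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical schema struct<a:int, b:list<int>>: level 0 processes ORC
  // ids 1 ("a") and 2 ("b"); level 1 processes id 3 (the list's elements).
  std::vector<std::vector<int>> orc_col_map = {
    /*level 0*/ {-1, 0, 1, -1},
    /*level 1*/ {-1, -1, -1, 0},
  };
  std::printf("orc column 2 maps to slot %d of level 0\n", orc_col_map[0][2]);
  return 0;
}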
 /**
  * @brief Implementation for ORC reader
  */
@@ -60,9 +78,9 @@ class reader::impl {
    * @param options Settings for controlling reading behavior
    * @param mr Device memory resource to use for device memory allocation
    */
-  explicit impl(std::vector<std::unique_ptr<datasource>> &&sources,
-                orc_reader_options const &options,
-                rmm::mr::device_memory_resource *mr);
+  explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                orc_reader_options const& options,
+                rmm::mr::device_memory_resource* mr);
 
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns
@@ -76,68 +94,123 @@ class reader::impl {
    */
   table_with_metadata read(size_type skip_rows,
                            size_type num_rows,
-                           const std::vector<std::vector<size_type>> &stripes,
+                           const std::vector<std::vector<size_type>>& stripes,
                            rmm::cuda_stream_view stream);
 
  private:
   /**
    * @brief Decompresses the stripe data, at stream granularity
    *
-   * @param chunks List of column chunk descriptors
+   * @param chunks Vector of list of column chunk descriptors
    * @param stripe_data List of source stripe column data
    * @param decompressor Originally host decompressor
    * @param stream_info List of stream to column mappings
    * @param num_stripes Number of stripes making up column chunks
-   * @param row_groups List of row index descriptors
+   * @param row_groups Vector of list of row index descriptors
    * @param row_index_stride Distance between each row index
+   * @param use_base_stride Whether to use base stride obtained from meta or use the computed value
    * @param stream CUDA stream used for device memory operations and kernel launches.
    *
    * @return Device buffer to decompressed page data
    */
-  rmm::device_buffer decompress_stripe_data(hostdevice_vector<gpu::ColumnDesc> &chunks,
-                                            const std::vector<rmm::device_buffer> &stripe_data,
-                                            const OrcDecompressor *decompressor,
-                                            std::vector<orc_stream_info> &stream_info,
-                                            size_t num_stripes,
-                                            device_span<gpu::RowGroup> row_groups,
-                                            size_t row_index_stride,
-                                            rmm::cuda_stream_view stream);
+  rmm::device_buffer decompress_stripe_data(
+    cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
+    const std::vector<rmm::device_buffer>& stripe_data,
+    const OrcDecompressor* decompressor,
+    std::vector<orc_stream_info>& stream_info,
+    size_t num_stripes,
+    cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
+    size_t row_index_stride,
+    bool use_base_stride,
+    rmm::cuda_stream_view stream);
 
   /**
    * @brief Converts the stripe column data and outputs to columns
    *
-   * @param chunks List of column chunk descriptors
+   * @param chunks Vector of list of column chunk descriptors
    * @param num_dicts Number of dictionary entries required
    * @param skip_rows Number of rows to offset from start
-   * @param num_rows Number of rows to output
    * @param tz_table Local time to UTC conversion table
-   * @param row_groups List of row index descriptors
+   * @param row_groups Vector of list of row index descriptors
    * @param row_index_stride Distance between each row index
    * @param out_buffers Output columns' device buffers
+   * @param level Current nesting level being processed
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
-  void decode_stream_data(hostdevice_vector<gpu::ColumnDesc> &chunks,
+  void decode_stream_data(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
                           size_t num_dicts,
                           size_t skip_rows,
-                          size_t num_rows,
                           timezone_table_view tz_table,
-                          device_span<gpu::RowGroup> row_groups,
+                          cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
                           size_t row_index_stride,
-                          std::vector<column_buffer> &out_buffers,
+                          std::vector<column_buffer>& out_buffers,
+                          size_t level,
                           rmm::cuda_stream_view stream);
 
+  /**
+   * @brief Aggregate child metadata from parent column chunks.
+   *
+   * @param chunks Vector of list of parent column chunks.
+   * @param row_groups Vector of list of parent column row groups.
+   * @param list_col Vector of column metadata of list type parent columns.
+   * @param level Current nesting level being processed.
+   */
+  void aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDesc> chunks,
+                            cudf::detail::host_2dspan<gpu::RowGroup> row_groups,
+                            std::vector<orc_column_meta> const& list_col,
+                            const int32_t level);
+
+  /**
+   * @brief Assemble the buffer with child columns.
+   *
+   * @param orc_col_id Column id in orc.
+   * @param col_buffers Column buffers for columns and children.
+   * @param level Current nesting level.
+   */
+  column_buffer&& assemble_buffer(const int32_t orc_col_id,
+                                  std::vector<std::vector<column_buffer>>& col_buffers,
+                                  const size_t level);
+
+  /**
+   * @brief Create columns and respective schema information from the buffer.
+   *
+   * @param col_buffers Column buffers for columns and children.
+   * @param out_columns Vector of columns formed from column buffers.
+   * @param schema_info Vector of schema information formed from column buffers.
+   * @param stream CUDA stream used for device memory operations and kernel launches.
+   */
+  void create_columns(std::vector<std::vector<column_buffer>>&& col_buffers,
+                      std::vector<std::unique_ptr<column>>& out_columns,
+                      std::vector<column_name_info>& schema_info,
+                      rmm::cuda_stream_view stream);
+
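// The assemble_buffer/create_columns pair declared above walks the per-level
// buffers depth-first: a parent at level L claims its children from level
// L + 1 before it is turned into a column. A simplified standalone sketch of
// that recursion (the node type and names here are assumptions, not the
// patch's types):
#include <cstddef>
#include <vector>

struct node_buffer {
  int orc_id = -1;
  std::vector<int> child_ids;          // ORC ids of the children, empty for leaves
  std::vector<node_buffer*> children;  // filled in while assembling
};

// levels[L] holds the buffers decoded at nesting level L, in processing order.
node_buffer* assemble(int orc_id, std::vector<std::vector<node_buffer>>& levels, std::size_t level)
{
  for (auto& candidate : levels[level]) {
    if (candidate.orc_id != orc_id) continue;
    // Depth-first: resolve children one level down before returning the parent.
    for (int child : candidate.child_ids) {
      candidate.children.push_back(assemble(child, levels, level + 1));
    }
    return &candidate;
  }
  return nullptr;  // orc_id not decoded at this level
}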
+  /**
+   * @brief Create empty columns and respective schema information from the buffer.
+   *
+   * @param orc_col_id Column id in orc.
+   * @param schema_info Vector of schema information formed from column buffers.
+   * @param stream CUDA stream used for device memory operations and kernel launches.
+   *
+   * @return An empty column equivalent to orc column type.
+   */
+  std::unique_ptr<column> create_empty_column(const int32_t orc_col_id,
+                                              column_name_info& schema_info,
+                                              rmm::cuda_stream_view stream);
+
  private:
-  rmm::mr::device_memory_resource *_mr = nullptr;
+  rmm::mr::device_memory_resource* _mr = nullptr;
   std::vector<std::unique_ptr<datasource>> _sources;
   std::unique_ptr<aggregate_orc_metadata> _metadata;
   // _output_columns associated schema indices
-  std::vector<size_type> _selected_columns;
+  std::vector<std::vector<orc_column_meta>> _selected_columns;
 
   bool _use_index = true;
   bool _use_np_dtypes = true;
   bool _has_timestamp_column = false;
+  bool _has_list_column = false;
   std::vector<std::string> _decimal_cols_as_float;
   data_type _timestamp_type{type_id::EMPTY};
+  reader_column_meta _col_meta;
 };
 
 }  // namespace orc
diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 4c85150a9f0..517a1e0e689 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -39,8 +39,8 @@ constexpr unsigned int init_groups_per_block = 4;
 constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block;
 
 __global__ void __launch_bounds__(init_threads_per_block)
-  gpu_init_statistics_groups(statistics_group *groups,
-                             const stats_column_desc *cols,
+  gpu_init_statistics_groups(statistics_group* groups,
+                             const stats_column_desc* cols,
                              uint32_t num_columns,
                              uint32_t num_rowgroups,
                              uint32_t row_index_stride)
@@ -49,7 +49,7 @@ __global__ void __launch_bounds__(init_threads_per_block)
   uint32_t col_id   = blockIdx.y;
   uint32_t chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y;
   uint32_t t        = threadIdx.x;
-  statistics_group *group = &group_g[threadIdx.y];
+  statistics_group* group = &group_g[threadIdx.y];
   if (chunk_id < num_rowgroups and t == 0) {
     uint32_t num_rows = cols[col_id].leaf_column->size();
     group->col        = &cols[col_id];
@@ -78,8 +78,8 @@ constexpr unsigned int pb_fldlen_common = 2 * pb_fld_hdrlen + pb_fldlen_int64;
 
 template <unsigned int block_size>
 __global__ void __launch_bounds__(block_size, 1)
-  gpu_init_statistics_buffersize(statistics_merge_group *groups,
-                                 const statistics_chunk *chunks,
+  gpu_init_statistics_buffersize(statistics_merge_group* groups,
+                                 const statistics_chunk* chunks,
                                  uint32_t statistics_count)
 {
   using block_scan = cub::BlockScan<uint32_t, block_size>;
@@ -91,7 +91,7 @@ __global__ void __launch_bounds__(block_size, 1)
   uint32_t stats_len = 0, stats_pos;
   uint32_t idx = start + t;
   if (idx < statistics_count) {
-    const stats_column_desc *col = groups[idx].col;
+    const stats_column_desc* col = groups[idx].col;
     statistics_dtype dtype       = col->stats_dtype;
     switch (dtype) {
       case dtype_bool: stats_len = pb_fldlen_common + pb_fld_hdrlen + pb_fldlen_bucket1; break;
@@ -131,8 +131,8 @@ __global__ void __launch_bounds__(block_size, 1)
 }
 
 struct stats_state_s {
-  uint8_t *base;  ///< Output buffer start
-  uint8_t *end;   ///< Output buffer end
+  uint8_t* base;  ///< Output buffer start
+  uint8_t* end;   ///< Output buffer end
   statistics_chunk chunk;
   statistics_merge_group group;
   stats_column_desc col;
@@ -146,7 +146,7 @@ struct stats_state_s {
  * https://developers.google.com/protocol-buffers/docs/encoding
  */
 // Protobuf varint encoding for unsigned int
-__device__ inline uint8_t *pb_encode_uint(uint8_t *p, uint64_t v)
+__device__ inline uint8_t* pb_encode_uint(uint8_t* p, uint64_t v)
 {
   while (v > 0x7f) {
     *p++ = ((uint32_t)v | 0x80);
@@ -157,30 +157,30 @@ __device__ inline uint8_t *pb_encode_uint(uint8_t *p, uint64_t v)
 }
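// For reference, pb_encode_uint's LEB128 varint layout plus the zigzag
// mapping used by pb_put_int below ((v ^ -s) * 2 + s), as a host-side
// round-trip check. This is a sketch with illustrative names, not part of
// the patch:
#include <cassert>
#include <cstdint>

static uint8_t* host_encode_uint(uint8_t* p, uint64_t v)
{
  while (v > 0x7f) {
    *p++ = static_cast<uint8_t>(v) | 0x80;  // low 7 bits, continuation bit set
    v >>= 7;
  }
  *p++ = static_cast<uint8_t>(v);
  return p;
}

static uint64_t host_decode_uint(const uint8_t* p)
{
  uint64_t v = 0;
  for (int shift = 0;; shift += 7) {
    uint8_t b = *p++;
    v |= static_cast<uint64_t>(b & 0x7f) << shift;
    if (!(b & 0x80)) return v;
  }
}

int main()
{
  uint8_t buf[10];
  host_encode_uint(buf, 300);  // 300 encodes as 0xAC 0x02
  assert(host_decode_uint(buf) == 300);
  int64_t v = -3, s = (v < 0);
  assert(uint64_t((v ^ -s) * 2 + s) == 5);  // zigzag: -3 -> 5
  return 0;
}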
// Protobuf field encoding for unsigned int -__device__ inline uint8_t *pb_put_uint(uint8_t *p, uint32_t id, uint64_t v) +__device__ inline uint8_t* pb_put_uint(uint8_t* p, uint32_t id, uint64_t v) { p[0] = id * 8 + PB_TYPE_VARINT; // NOTE: Assumes id < 16 return pb_encode_uint(p + 1, v); } // Protobuf field encoding for signed int -__device__ inline uint8_t *pb_put_int(uint8_t *p, uint32_t id, int64_t v) +__device__ inline uint8_t* pb_put_int(uint8_t* p, uint32_t id, int64_t v) { int64_t s = (v < 0); return pb_put_uint(p, id, (v ^ -s) * 2 + s); } // Protobuf field encoding for 'packed' unsigned int (single value) -__device__ inline uint8_t *pb_put_packed_uint(uint8_t *p, uint32_t id, uint64_t v) +__device__ inline uint8_t* pb_put_packed_uint(uint8_t* p, uint32_t id, uint64_t v) { - uint8_t *p2 = pb_encode_uint(p + 2, v); + uint8_t* p2 = pb_encode_uint(p + 2, v); p[0] = id * 8 + PB_TYPE_FIXEDLEN; p[1] = static_cast(p2 - (p + 2)); return p2; } // Protobuf field encoding for binary/string -__device__ inline uint8_t *pb_put_binary(uint8_t *p, uint32_t id, const void *bytes, uint32_t len) +__device__ inline uint8_t* pb_put_binary(uint8_t* p, uint32_t id, const void* bytes, uint32_t len) { p[0] = id * 8 + PB_TYPE_FIXEDLEN; p = pb_encode_uint(p + 1, len); @@ -189,7 +189,7 @@ __device__ inline uint8_t *pb_put_binary(uint8_t *p, uint32_t id, const void *by } // Protobuf field encoding for 64-bit raw encoding (double) -__device__ inline uint8_t *pb_put_fixed64(uint8_t *p, uint32_t id, const void *raw64) +__device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, const void* raw64) { p[0] = id * 8 + PB_TYPE_FIXED64; memcpy(p + 1, raw64, 8); @@ -226,15 +226,15 @@ constexpr unsigned int encode_threads_per_block = encode_threads_per_chunk * encode_chunks_per_block; __global__ void __launch_bounds__(encode_threads_per_block) - gpu_encode_statistics(uint8_t *blob_bfr, - statistics_merge_group *groups, - const statistics_chunk *chunks, + gpu_encode_statistics(uint8_t* blob_bfr, + statistics_merge_group* groups, + const statistics_chunk* chunks, uint32_t statistics_count) { __shared__ __align__(8) stats_state_s state_g[encode_chunks_per_block]; uint32_t t = threadIdx.x; uint32_t idx = blockIdx.x * encode_chunks_per_block + threadIdx.y; - stats_state_s *const s = &state_g[threadIdx.y]; + stats_state_s* const s = &state_g[threadIdx.y]; // Encode and update actual bfr size if (idx < statistics_count && t == 0) { @@ -243,8 +243,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) s->col = *(s->group.col); s->base = blob_bfr + s->group.start_chunk; s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; - uint8_t *cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); - uint8_t *fld_start = cur; + uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + uint8_t* fld_start = cur; switch (s->col.stats_dtype) { case dtype_int8: case dtype_int16: @@ -373,8 +373,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) * @param[in] row_index_stride Rowgroup size in rows * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void orc_init_statistics_groups(statistics_group *groups, - const stats_column_desc *cols, +void orc_init_statistics_groups(statistics_group* groups, + const stats_column_desc* cols, uint32_t num_columns, uint32_t num_rowgroups, uint32_t row_index_stride, @@ -394,8 +394,8 @@ void orc_init_statistics_groups(statistics_group *groups, * @param[in] statistics_count Number of statistics buffers to encode * @param[in] stream CUDA stream to 
use, default `rmm::cuda_stream_default` */ -void orc_init_statistics_buffersize(statistics_merge_group *groups, - const statistics_chunk *chunks, +void orc_init_statistics_buffersize(statistics_merge_group* groups, + const statistics_chunk* chunks, uint32_t statistics_count, rmm::cuda_stream_view stream) { @@ -411,9 +411,9 @@ void orc_init_statistics_buffersize(statistics_merge_group *groups, * @param[in,out] chunks Statistics data * @param[in] statistics_count Number of statistics buffers */ -void orc_encode_statistics(uint8_t *blob_bfr, - statistics_merge_group *groups, - const statistics_chunk *chunks, +void orc_encode_statistics(uint8_t* blob_bfr, + statistics_merge_group* groups, + const statistics_chunk* chunks, uint32_t statistics_count, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 6bc0e475a27..903f9475e2a 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -50,7 +50,7 @@ struct int128_s { }; struct orc_bytestream_s { - const uint8_t *base; + const uint8_t* base; uint32_t pos; uint32_t len; uint32_t fill_pos; @@ -93,7 +93,7 @@ struct orc_rowdec_state_s { }; struct orc_strdict_state_s { - DictionaryEntry *local_dict; + DictionaryEntry* local_dict; uint32_t dict_pos; uint32_t dict_len; }; @@ -113,6 +113,7 @@ struct orcdec_state_s { orc_bytestream_s bs; orc_bytestream_s bs2; int is_string; + uint64_t num_child_rows; union { orc_strdict_state_s dict; uint32_t nulls_desc_row; // number of rows processed for nulls. @@ -145,8 +146,8 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s *bs, - const uint8_t *base, +static __device__ void bytestream_init(volatile orc_bytestream_s* bs, + const uint8_t* base, uint32_t len) { uint32_t pos = (len > 0) ? 
static_cast(7 & reinterpret_cast(base)) : 0; @@ -163,7 +164,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s *bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s *bs, +static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; @@ -182,7 +183,7 @@ static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s *bs, * @param[in] bs Byte stream input * @param[in] t thread id */ -static __device__ void bytestream_fill(orc_bytestream_s *bs, int t) +static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) { auto const count = bs->fill_count; if (t < count) { @@ -200,7 +201,7 @@ static __device__ void bytestream_fill(orc_bytestream_s *bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -212,7 +213,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s *bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -227,7 +228,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s *bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -248,7 +249,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s *bs, int * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s *bs, +inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits) { @@ -266,7 +267,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s *bs, +inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits) { @@ -291,10 +292,10 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - uint32_t &result) + uint32_t& result) { result = bytestream_readbits(bs, bitpos, numbits); } @@ -307,10 +308,10 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void 
bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - int32_t &result) + int32_t& result) { uint32_t u = bytestream_readbits(bs, bitpos, numbits); result = (int32_t)((u >> 1u) ^ -(int32_t)(u & 1)); @@ -324,10 +325,10 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - uint64_t &result) + uint64_t& result) { result = bytestream_readbits64(bs, bitpos, numbits); } @@ -340,10 +341,10 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - int64_t &result) + int64_t& result) { uint64_t u = bytestream_readbits64(bs, bitpos, numbits); result = (int64_t)((u >> 1u) ^ -(int64_t)(u & 1)); @@ -357,7 +358,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -395,7 +396,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s *bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s *bs, int pos, T &result) +inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -449,7 +450,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s *bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ int128_s decode_varint128(volatile orc_bytestream_s *bs, int pos) +inline __device__ int128_s decode_varint128(volatile orc_bytestream_s* bs, int pos) { uint32_t b = bytestream_readbyte(bs, pos++); int64_t sign_mask = -(int32_t)(b & 1); @@ -477,7 +478,7 @@ inline __device__ int128_s decode_varint128(volatile orc_bytestream_s *bs, int p /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint32_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -488,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint64_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -499,7 +500,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int32_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) { 
uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -510,7 +511,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int64_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -528,7 +529,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T *vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -549,7 +550,7 @@ inline __device__ void lengths_to_positions(volatile T *vals, uint32_t numvals, */ template static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s *bs, volatile orc_rlev1_state_s *rle, volatile T *vals, uint32_t maxvals, int t) + orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -602,7 +603,9 @@ static __device__ uint32_t Integer_RLEv1( int delta = run_data >> 24; uint32_t base = run_data & 0x3ff; uint32_t pos = vals[base] & 0xffff; - for (int i = 1 + tr; i < n; i += 32) { vals[base + i] = ((delta * i) << 16) | pos; } + for (int i = 1 + tr; i < n; i += 32) { + vals[base + i] = ((delta * i) << 16) | pos; + } } __syncthreads(); } @@ -648,12 +651,17 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { * @param[in] vals buffer for output values (uint32_t, int32_t, uint64_t or int64_t) * @param[in] maxvals maximum number of values to decode * @param[in] t thread id + * @param[in] has_buffered_values If true, means there are already buffered values * * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv2( - orc_bytestream_s *bs, volatile orc_rlev2_state_s *rle, volatile T *vals, uint32_t maxvals, int t) +static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, + volatile orc_rlev2_state_s* rle, + volatile T* vals, + uint32_t maxvals, + int t, + bool has_buffered_values = false) { uint32_t numvals, numruns; int r, tr; @@ -700,11 +708,16 @@ static __device__ uint32_t Integer_RLEv2( l += deltapos; } } - if (numvals + n > maxvals) break; + if ((numvals != 0) and (numvals + n > maxvals)) break; + // case where there are buffered values and can't consume a whole chunk + // from decoded values, so skip adding any more to buffer, work on buffered values and then + // start fresh in next iteration with empty buffer. + if ((numvals == 0) and (n > maxvals) and (has_buffered_values)) break; + pos += l; if (pos > maxpos) break; + ((numvals == 0) and (n > maxvals)) ? 
numvals = maxvals : numvals += n; lastpos = pos; - numvals += n; numruns++; } rle->num_vals = numvals; @@ -864,7 +877,9 @@ static __device__ uint32_t Integer_RLEv2( baseval = rle->baseval.u32[r]; else baseval = rle->baseval.u64[r]; - for (uint32_t j = tr; j < n; j += 32) { vals[base + j] += baseval; } + for (uint32_t j = tr; j < n; j += 32) { + vals[base + j] += baseval; + } } } __syncthreads(); @@ -879,7 +894,7 @@ static __device__ uint32_t Integer_RLEv2( * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t *vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -899,9 +914,9 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t *vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s *bs, - volatile orc_byterle_state_s *rle, - volatile uint8_t *vals, +static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, + volatile orc_byterle_state_s* rle, + volatile uint8_t* vals, uint32_t maxvals, int t) { @@ -926,9 +941,10 @@ static __device__ uint32_t Byte_RLE(orc_bytestream_s *bs, n = 0x100 - n; pos += n; } - if (pos > maxpos || numvals + n > maxvals) { break; } + if ((numvals != 0) and (numvals + n > maxvals)) break; + if (pos > maxpos) break; numruns++; - numvals += n; + ((numvals == 0) and (n > maxvals)) ? numvals = maxvals : numvals += n; lastpos = pos; } rle->num_runs = numruns; @@ -1009,9 +1025,9 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1, * * @return number of values decoded */ -static __device__ int Decode_Decimals(orc_bytestream_s *bs, - volatile orc_byterle_state_s *scratch, - volatile orcdec_state_s::values &vals, +static __device__ int Decode_Decimals(orc_bytestream_s* bs, + volatile orc_byterle_state_s* scratch, + volatile orcdec_state_s::values& vals, int val_scale, int numvals, int col_scale, @@ -1113,11 +1129,10 @@ static __device__ int Decode_Decimals(orc_bytestream_s *bs, // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size) - gpuDecodeNullsAndStringDictionaries(ColumnDesc *chunks, - DictionaryEntry *global_dictionary, + gpuDecodeNullsAndStringDictionaries(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, uint32_t num_columns, uint32_t num_stripes, - size_t max_num_rows, size_t first_row) { __shared__ __align__(16) orcdec_state_s state_g; @@ -1128,15 +1143,17 @@ __global__ void __launch_bounds__(block_size) typename block_reduce::TempStorage bk_storage; } temp_storage; - orcdec_state_s *const s = &state_g; - bool is_nulldec = (blockIdx.y >= num_stripes); - uint32_t column = blockIdx.x; - uint32_t stripe = (is_nulldec) ? blockIdx.y - num_stripes : blockIdx.y; - uint32_t chunk_id = stripe * num_columns + column; + orcdec_state_s* const s = &state_g; + const bool is_nulldec = (blockIdx.y >= num_stripes); + const uint32_t column = blockIdx.x; + const uint32_t stripe = (is_nulldec) ? 
blockIdx.y - num_stripes : blockIdx.y; + const uint32_t chunk_id = stripe * num_columns + column; int t = threadIdx.x; if (t == 0) s->chunk = chunks[chunk_id]; __syncthreads(); + const size_t max_num_rows = s->chunk.column_num_rows; + if (is_nulldec) { uint32_t null_count = 0; // Decode NULLs @@ -1176,7 +1193,7 @@ __global__ void __launch_bounds__(block_size) int64_t dst_pos = max(dst_row, (int64_t)0); uint32_t startbit = -static_cast(min(dst_row, (int64_t)0)); uint32_t nbits = nrows - min(startbit, nrows); - uint32_t *valid = s->chunk.valid_map_base + (dst_pos >> 5); + uint32_t* valid = s->chunk.valid_map_base + (dst_pos >> 5); uint32_t bitpos = static_cast(dst_pos) & 0x1f; if ((size_t)(dst_pos + nbits) > max_num_rows) { nbits = static_cast(max_num_rows - min((size_t)dst_pos, max_num_rows)); @@ -1251,7 +1268,7 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); while (s->top.dict.dict_len > 0) { uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile uint32_t *vals = s->vals.u32; + volatile uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1293,10 +1310,10 @@ __global__ void __launch_bounds__(block_size) * @param[in] temp_storage shared memory storage to perform block reduce */ template -static __device__ void DecodeRowPositions(orcdec_state_s *s, +static __device__ void DecodeRowPositions(orcdec_state_s* s, size_t first_row, int t, - Storage &temp_storage) + Storage& temp_storage) { using block_reduce = cub::BlockReduce; @@ -1322,10 +1339,9 @@ static __device__ void DecodeRowPositions(orcdec_state_s *s, uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); uint32_t r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); uint32_t valid = (t < nrows && r < rmax) - ? (((const uint8_t *)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + ? 
(((const uint8_t*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 : 0; - volatile uint16_t *row_ofs_plus1 = - (volatile uint16_t *)&s->u.rowdec.row[s->u.rowdec.nz_count]; + volatile uint16_t* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); @@ -1372,54 +1388,62 @@ static const __device__ __constant__ uint32_t kTimestampNanoScale[8] = { * @param[in] global_dictionary Global dictionary device array * @param[in] tz_table Timezone translation table * @param[in] row_groups Optional row index data - * @param[in] max_num_rows Maximum number of rows to load * @param[in] first_row Crop all rows below first_row - * @param[in] num_chunks Number of column chunks (num_columns * num_stripes) - * @param[in] num_rowgroups Number of row groups in row index data * @param[in] rowidx_stride Row index stride + * @param[in] level nesting level being processed */ // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size) - gpuDecodeOrcColumnData(ColumnDesc const *chunks, - DictionaryEntry *global_dictionary, + gpuDecodeOrcColumnData(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, timezone_table_view tz_table, - const RowGroup *row_groups, - size_t max_num_rows, + device_2dspan row_groups, size_t first_row, - uint32_t num_columns, - uint32_t num_rowgroups, - uint32_t rowidx_stride) + uint32_t rowidx_stride, + size_t level) { __shared__ __align__(16) orcdec_state_s state_g; - __shared__ typename cub::BlockReduce::TempStorage temp_storage; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename cub::BlockReduce::TempStorage blk_uint32; + typename cub::BlockReduce::TempStorage blk_uint64; + } temp_storage; - orcdec_state_s *const s = &state_g; + orcdec_state_s* const s = &state_g; uint32_t chunk_id; - int t = threadIdx.x; + int t = threadIdx.x; + auto num_rowgroups = row_groups.size().first; if (num_rowgroups > 0) { - if (t == 0) s->top.data.index = row_groups[blockIdx.y * num_columns + blockIdx.x]; + if (t == 0) { s->top.data.index = row_groups[blockIdx.y][blockIdx.x]; } __syncthreads(); chunk_id = s->top.data.index.chunk_id; } else { chunk_id = blockIdx.x; } - if (t == 0) s->chunk = chunks[chunk_id]; - - __syncthreads(); if (t == 0) { + s->chunk = chunks[chunk_id]; + s->num_child_rows = 0; + } + __syncthreads(); + // Struct doesn't have any data in itself, so skip + const bool is_valid = s->chunk.type_kind != STRUCT; + const size_t max_num_rows = s->chunk.column_num_rows; + if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { uint32_t ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]); uint32_t ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]); - uint32_t rowgroup_rowofs; + uint32_t rowgroup_rowofs = + (level == 0) ? 
(blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride + : s->top.data.index.start_row; + ; s->chunk.streams[CI_DATA] += ofs0; s->chunk.strm_len[CI_DATA] -= ofs0; s->chunk.streams[CI_DATA2] += ofs1; s->chunk.strm_len[CI_DATA2] -= ofs1; - rowgroup_rowofs = min((blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride, - s->chunk.num_rows); + rowgroup_rowofs = min(rowgroup_rowofs, s->chunk.num_rows); s->chunk.start_row += rowgroup_rowofs; s->chunk.num_rows -= rowgroup_rowofs; } @@ -1433,7 +1457,8 @@ __global__ void __launch_bounds__(block_size) s->top.data.end_row = static_cast(first_row + max_num_rows); } if (num_rowgroups > 0) { - s->top.data.end_row = min(s->top.data.end_row, s->chunk.start_row + rowidx_stride); + s->top.data.end_row = + min(s->top.data.end_row, s->chunk.start_row + s->top.data.index.num_rows); } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } @@ -1443,7 +1468,9 @@ __global__ void __launch_bounds__(block_size) bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); } __syncthreads(); - while (s->top.data.cur_row < s->top.data.end_row) { + + while (is_valid && (s->top.data.cur_row < s->top.data.end_row)) { + uint32_t list_child_elements = 0; bytestream_fill(&s->bs, t); bytestream_fill(&s->bs2, t); __syncthreads(); @@ -1467,7 +1494,7 @@ __global__ void __launch_bounds__(block_size) uint32_t vals_skipped = 0; if (s->is_string || s->chunk.type_kind == TIMESTAMP) { // For these data types, we have a secondary unsigned 32-bit data stream - orc_bytestream_s *bs = (is_dictionary(s->chunk.encoding_kind)) ? &s->bs : &s->bs2; + orc_bytestream_s* bs = (is_dictionary(s->chunk.encoding_kind)) ? &s->bs : &s->bs2; uint32_t ofs = 0; if (s->chunk.type_kind == TIMESTAMP) { // Restore buffered secondary stream values, if any @@ -1485,9 +1512,11 @@ __global__ void __launch_bounds__(block_size) numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t); } else { if (s->chunk.type_kind == TIMESTAMP) - numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u64[ofs], numvals - ofs, t); + numvals = + ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u64[ofs], numvals - ofs, t, ofs > 0); else - numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t); + numvals = + ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t, ofs > 0); } __syncthreads(); if (numvals <= ofs && t >= ofs && t < s->top.data.max_vals) { s->vals.u32[t] = 0; } @@ -1533,8 +1562,9 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); // Account for skipped values if (num_rowgroups > 0 && !s->is_string) { - uint32_t run_pos = (s->chunk.type_kind == DECIMAL) ? s->top.data.index.run_pos[CI_DATA2] - : s->top.data.index.run_pos[CI_DATA]; + uint32_t run_pos = (s->chunk.type_kind == DECIMAL || s->chunk.type_kind == LIST) + ? s->top.data.index.run_pos[CI_DATA2] + : s->top.data.index.run_pos[CI_DATA]; numvals = min(numvals + run_pos, (s->chunk.type_kind == BOOLEAN) ? 
blockDim.x * 2 : blockDim.x); } @@ -1547,6 +1577,13 @@ __global__ void __launch_bounds__(block_size) numvals = Integer_RLEv2(&s->bs, &s->u.rlev2, s->vals.i32, numvals, t); } __syncthreads(); + } else if (s->chunk.type_kind == LIST) { + if (is_rlev1(s->chunk.encoding_kind)) { + numvals = Integer_RLEv1(&s->bs2, &s->u.rlev1, s->vals.u64, numvals, t); + } else { + numvals = Integer_RLEv2(&s->bs2, &s->u.rlev2, s->vals.u64, numvals, t); + } + __syncthreads(); } else if (s->chunk.type_kind == BYTE) { numvals = Byte_RLE(&s->bs, &s->u.rle8, s->vals.u8, numvals, t); __syncthreads(); @@ -1583,7 +1620,7 @@ __global__ void __launch_bounds__(block_size) } else if (s->chunk.type_kind == LONG || s->chunk.type_kind == TIMESTAMP || s->chunk.type_kind == DECIMAL) { - orc_bytestream_s *bs = (s->chunk.type_kind == DECIMAL) ? &s->bs2 : &s->bs; + orc_bytestream_s* bs = (s->chunk.type_kind == DECIMAL) ? &s->bs2 : &s->bs; if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { @@ -1629,12 +1666,16 @@ __global__ void __launch_bounds__(block_size) } else { vals_skipped = 0; if (num_rowgroups > 0) { - uint32_t run_pos = s->top.data.index.run_pos[CI_DATA]; + uint32_t run_pos = (s->chunk.type_kind == LIST) ? s->top.data.index.run_pos[CI_DATA2] + : s->top.data.index.run_pos[CI_DATA]; if (run_pos) { vals_skipped = min(numvals, run_pos); numvals -= vals_skipped; __syncthreads(); - if (t == 0) { s->top.data.index.run_pos[CI_DATA] = 0; } + if (t == 0) { + (s->chunk.type_kind == LIST) ? s->top.data.index.run_pos[CI_DATA2] = 0 + : s->top.data.index.run_pos[CI_DATA] = 0; + } } } } @@ -1647,56 +1688,66 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); // Use the valid bits to compute non-null row positions until we get a full batch of values to // decode - DecodeRowPositions(s, first_row, t, temp_storage); + DecodeRowPositions(s, first_row, t, temp_storage.blk_uint32); if (!s->top.data.nrows && !s->u.rowdec.nz_count && !vals_skipped) { // This is a bug (could happen with bitstream errors with a bad run that would produce more // values than the number of remaining rows) return; } + // Store decoded values to output if (t < min(min(s->top.data.max_vals, s->u.rowdec.nz_count), s->top.data.nrows) && s->u.rowdec.row[t] != 0 && s->top.data.cur_row + s->u.rowdec.row[t] - 1 < s->top.data.end_row) { size_t row = s->top.data.cur_row + s->u.rowdec.row[t] - 1 - first_row; if (row < max_num_rows) { - void *data_out = s->chunk.column_data_base; + void* data_out = s->chunk.column_data_base; switch (s->chunk.type_kind) { case FLOAT: - case INT: static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; break; + case INT: static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; break; case DOUBLE: case LONG: case DECIMAL: - static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; + static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; break; + case LIST: { + // Since the offsets column in cudf is `size_type`, + // If the limit exceeds then value will be 0, which is Fail. 
+            cudf_assert(
+              (s->vals.u64[t + vals_skipped] <= std::numeric_limits<size_type>::max()) and
+              "Number of elements is more than what size_type can handle");
+            list_child_elements = s->vals.u64[t + vals_skipped];
+            static_cast<uint32_t*>(data_out)[row] = list_child_elements;
+          } break;
           case SHORT:
-            static_cast<int16_t *>(data_out)[row] =
+            static_cast<int16_t*>(data_out)[row] =
               static_cast<int16_t>(s->vals.u32[t + vals_skipped]);
             break;
-          case BYTE: static_cast<uint8_t *>(data_out)[row] = s->vals.u8[t + vals_skipped]; break;
+          case BYTE: static_cast<uint8_t*>(data_out)[row] = s->vals.u8[t + vals_skipped]; break;
           case BOOLEAN:
-            static_cast<uint8_t *>(data_out)[row] =
+            static_cast<uint8_t*>(data_out)[row] =
               (s->vals.u8[(t + vals_skipped) >> 3] >> ((~(t + vals_skipped)) & 7)) & 1;
             break;
           case DATE:
             if (s->chunk.dtype_len == 8) {
               // Convert from days to milliseconds by multiplying by 24*3600*1000
-              static_cast<int64_t *>(data_out)[row] =
+              static_cast<int64_t*>(data_out)[row] =
                 86400000ll * (int64_t)s->vals.i32[t + vals_skipped];
             } else {
-              static_cast<uint32_t *>(data_out)[row] = s->vals.u32[t + vals_skipped];
+              static_cast<uint32_t*>(data_out)[row] = s->vals.u32[t + vals_skipped];
             }
             break;
           case STRING:
           case BINARY:
           case VARCHAR:
           case CHAR: {
-            string_index_pair *strdesc = &static_cast<string_index_pair *>(data_out)[row];
-            void const *ptr            = nullptr;
+            string_index_pair* strdesc = &static_cast<string_index_pair*>(data_out)[row];
+            void const* ptr            = nullptr;
             uint32_t count             = 0;
             if (is_dictionary(s->chunk.encoding_kind)) {
               auto const dict_idx = s->vals.u32[t + vals_skipped];
               if (dict_idx < s->chunk.dict_len) {
-                auto const &g_entry = global_dictionary[s->chunk.dictionary_start + dict_idx];
+                auto const& g_entry = global_dictionary[s->chunk.dictionary_start + dict_idx];
 
                 ptr   = s->chunk.streams[CI_DICTIONARY] + g_entry.pos;
                 count = g_entry.len;
@@ -1710,7 +1761,7 @@ __global__ void __launch_bounds__(block_size)
                 count = secondary_val;
               }
             }
-            strdesc->first  = static_cast<char const *>(ptr);
+            strdesc->first  = static_cast<char const*>(ptr);
             strdesc->second = count;
             break;
           }
@@ -1723,17 +1774,21 @@ __global__ void __launch_bounds__(block_size)
             }
             if (seconds < 0 && nanos != 0) { seconds -= 1; }
             if (s->chunk.ts_clock_rate)
-              static_cast<int64_t *>(data_out)[row] =
+              static_cast<int64_t*>(data_out)[row] =
                 seconds * s->chunk.ts_clock_rate +
                 (nanos + (499999999 / s->chunk.ts_clock_rate)) /
                   (1000000000 / s->chunk.ts_clock_rate);  // Output to desired clock rate
             else
-              static_cast<int64_t *>(data_out)[row] = seconds * 1000000000 + nanos;
+              static_cast<int64_t*>(data_out)[row] = seconds * 1000000000 + nanos;
             break;
           }
         }
       }
     }
+    // Aggregate num of elements for the chunk
+    if (s->chunk.type_kind == LIST) {
+      list_child_elements = block_reduce(temp_storage.blk_uint64).Sum(list_child_elements);
+    }
     __syncthreads();
     // Buffer secondary stream values
     if (s->chunk.type_kind == TIMESTAMP) {
@@ -1748,12 +1803,19 @@ __global__ void __launch_bounds__(block_size)
     __syncthreads();
     if (t == 0) {
       s->top.data.cur_row += s->top.data.nrows;
+      if (s->chunk.type_kind == LIST) { s->num_child_rows += list_child_elements; }
       if (s->is_string && !is_dictionary(s->chunk.encoding_kind) && s->top.data.max_vals > 0) {
         s->chunk.dictionary_start += s->vals.u32[s->top.data.max_vals - 1];
       }
     }
     __syncthreads();
   }
+  if (t == 0 and s->chunk.type_kind == LIST) {
+    if (num_rowgroups > 0) {
+      row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows;
+    }
+    atomicAdd(&chunks[chunk_id].num_child_rows, s->num_child_rows);
+  }
 }
 
 /**
@@ -1763,22 +1825,20 @@ __global__ void __launch_bounds__(block_size)
  * @param[in] global_dictionary Global dictionary device array
  * @param[in] num_columns Number of columns
  * @param[in] num_stripes Number of stripes
- * @param[in] max_rows Maximum number
of rows to load * @param[in] first_row Crop all rows below first_row * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks, - DictionaryEntry *global_dictionary, +void __host__ DecodeNullsAndStringDictionaries(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, uint32_t num_columns, uint32_t num_stripes, - size_t max_num_rows, size_t first_row, rmm::cuda_stream_view stream) { dim3 dim_block(block_size, 1); dim3 dim_grid(num_columns, num_stripes * 2); // 1024 threads per chunk gpuDecodeNullsAndStringDictionaries<<>>( - chunks, global_dictionary, num_columns, num_stripes, max_num_rows, first_row); + chunks, global_dictionary, num_columns, num_stripes, first_row); } /** @@ -1788,39 +1848,32 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks, * @param[in] global_dictionary Global dictionary device array * @param[in] num_columns Number of columns * @param[in] num_stripes Number of stripes - * @param[in] max_rows Maximum number of rows to load * @param[in] first_row Crop all rows below first_row * @param[in] tz_table Timezone translation table - * @param[in] row_groups Optional row index data + * @param[in] row_groups Optional row index data [row_group][column] * @param[in] num_rowgroups Number of row groups in row index data * @param[in] rowidx_stride Row index stride + * @param[in] level nesting level being processed * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void __host__ DecodeOrcColumnData(ColumnDesc const *chunks, - DictionaryEntry *global_dictionary, +void __host__ DecodeOrcColumnData(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, + device_2dspan row_groups, uint32_t num_columns, uint32_t num_stripes, - size_t max_num_rows, size_t first_row, timezone_table_view tz_table, - const RowGroup *row_groups, uint32_t num_rowgroups, uint32_t rowidx_stride, + size_t level, rmm::cuda_stream_view stream) { uint32_t num_chunks = num_columns * num_stripes; dim3 dim_block(block_size, 1); // 1024 threads per chunk dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks, (num_rowgroups > 0) ? 
num_rowgroups : 1); - gpuDecodeOrcColumnData<<>>(chunks, - global_dictionary, - tz_table, - row_groups, - max_num_rows, - first_row, - num_columns, - num_rowgroups, - rowidx_stride); + gpuDecodeOrcColumnData<<>>( + chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b469d7215b4..e007c49e61c 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -57,7 +57,7 @@ struct intrle_enc_state_s { struct strdata_enc_state_s { uint32_t char_count; uint32_t lengths_red[(512 / 32)]; - const char *str_data[512]; + const char* str_data[512]; }; struct orcenc_state_s { @@ -115,9 +115,9 @@ static inline __device__ uint32_t CountLeadingBytes64(uint64_t v) { return __clz /** * @brief Raw data output * - * @param[in] cid stream type (strm_pos[cid] will be updated and output stored at - *streams[cid]+strm_pos[cid]) - * @param[in] inmask input buffer position mask for circular buffers + * @tparam cid stream type (strm_pos[cid] will be updated and output stored at + * streams[cid]+strm_pos[cid]) + * @tparam inmask input buffer position mask for circular buffers * @param[in] s encoder state * @param[in] inbuf base input buffer * @param[in] inpos position in input buffer @@ -126,9 +126,9 @@ static inline __device__ uint32_t CountLeadingBytes64(uint64_t v) { return __clz */ template static __device__ void StoreBytes( - orcenc_state_s *s, const uint8_t *inbuf, uint32_t inpos, uint32_t count, int t) + orcenc_state_s* s, const uint8_t* inbuf, uint32_t inpos, uint32_t count, int t) { - uint8_t *dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; while (count > 0) { uint32_t n = min(count, 512); if (t < n) { dst[t] = inbuf[(inpos + t) & inmask]; } @@ -143,12 +143,12 @@ static __device__ void StoreBytes( /** * @brief ByteRLE encoder * - * @param[in] cid stream type (strm_pos[cid] will be updated and output stored at - *streams[cid]+strm_pos[cid]) + * @tparam cid stream type (strm_pos[cid] will be updated and output stored at + * streams[cid]+strm_pos[cid]) + * @tparam inmask input buffer position mask for circular buffers * @param[in] s encoder state * @param[in] inbuf base input buffer * @param[in] inpos position in input buffer - * @param[in] inmask input buffer position mask for circular buffers * @param[in] numvals max number of values to encode * @param[in] flush encode all remaining values if nonzero * @param[in] t thread id @@ -157,9 +157,9 @@ static __device__ void StoreBytes( */ template static __device__ uint32_t ByteRLE( - orcenc_state_s *s, const uint8_t *inbuf, uint32_t inpos, uint32_t numvals, uint32_t flush, int t) + orcenc_state_s* s, const uint8_t* inbuf, uint32_t inpos, uint32_t numvals, uint32_t flush, int t) { - uint8_t *dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; while (numvals > 0) { @@ -272,7 +272,7 @@ static const __device__ __constant__ uint8_t kByteLengthToRLEv2_W[9] = { /** * @brief Encode a varint value, return the number of bytes written */ -static inline __device__ uint32_t StoreVarint(uint8_t *dst, uint64_t v) +static inline __device__ uint32_t StoreVarint(uint8_t* dst, uint64_t v) { uint32_t bytecnt = 0; for (;;) { @@ -289,7 +289,7 @@ static inline __device__ uint32_t StoreVarint(uint8_t *dst, uint64_t v) } template -static inline __device__ void StoreBytesBigEndian(uint8_t *dst, T v, 
uint32_t w) +static inline __device__ void StoreBytesBigEndian(uint8_t* dst, T v, uint32_t w) { for (uint32_t i = 0, b = w * 8; i < w; ++i) { b -= 8; @@ -299,7 +299,7 @@ static inline __device__ void StoreBytesBigEndian(uint8_t *dst, T v, uint32_t w) // Combine and store bits for symbol widths less than 8 static inline __device__ void StoreBitsBigEndian( - uint8_t *dst, uint32_t v, uint32_t w, int num_vals, int t) + uint8_t* dst, uint32_t v, uint32_t w, int num_vals, int t) { if (t <= (num_vals | 0x1f)) { uint32_t mask; @@ -324,12 +324,12 @@ static inline __device__ void StoreBitsBigEndian( /** * @brief Integer RLEv2 encoder * - * @param[in] cid stream type (strm_pos[cid] will be updated and output stored at - *streams[cid]+strm_pos[cid]) + * @tparam cid stream type (strm_pos[cid] will be updated and output stored at + * streams[cid]+strm_pos[cid]) + * @tparam inmask input buffer position mask for circular buffers * @param[in] s encoder state * @param[in] inbuf base input buffer * @param[in] inpos position in input buffer - * @param[in] inmask input buffer position mask for circular buffers * @param[in] numvals max number of values to encode * @param[in] flush encode all remaining values if nonzero * @param[in] t thread id @@ -343,16 +343,16 @@ template -static __device__ uint32_t IntegerRLE(orcenc_state_s *s, - const T *inbuf, +static __device__ uint32_t IntegerRLE(orcenc_state_s* s, + const T* inbuf, uint32_t inpos, uint32_t numvals, uint32_t flush, int t, - Storage &temp_storage) + Storage& temp_storage) { using block_reduce = cub::BlockReduce; - uint8_t *dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; __shared__ volatile uint64_t block_vmin; @@ -473,7 +473,7 @@ static __device__ uint32_t IntegerRLE(orcenc_state_s *s, uint32_t bw, pw = 1, pll, pgw = 1, bv_scale = (is_signed) ? 0 : 1; vmax = (is_signed) ? ((vmin < 0) ? -vmin : vmin) * 2 : vmin; bw = (sizeof(T) > 4) ? 
(8 - min(CountLeadingBytes64(vmax << bv_scale), 7)) - : (4 - min(CountLeadingBytes32(vmax << bv_scale), 3)); + : (4 - min(CountLeadingBytes32(vmax << bv_scale), 3)); if (zero_pll_war) { // Insert a dummy zero patch pll = 1; @@ -560,8 +560,8 @@ static __device__ uint32_t IntegerRLE(orcenc_state_s *s, * @param[in] len(t) string length (per thread) * @param[in] t thread id */ -static __device__ void StoreStringData(uint8_t *dst, - strdata_enc_state_s *strenc, +static __device__ void StoreStringData(uint8_t* dst, + strdata_enc_state_s* strenc, uint32_t len, int t) { @@ -601,7 +601,7 @@ static __device__ void StoreStringData(uint8_t *dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T *vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -619,7 +619,7 @@ static const __device__ __constant__ int32_t kTimeScale[10] = { * @brief Encode column data * * @param[in] chunks encoder chunks device array [column][rowgroup] - * @param[in, out] chunks cunk streams device array [column][rowgroup] + * @param[in, out] streams chunk streams device array [column][rowgroup] */ // blockDim {512,1,1} template @@ -635,7 +635,7 @@ __global__ void __launch_bounds__(block_size) typename cub::BlockReduce::TempStorage u64; } temp_storage; - orcenc_state_s *const s = &state_g; + orcenc_state_s* const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t group_id = blockIdx.y; int t = threadIdx.x; @@ -913,7 +913,7 @@ __global__ void __launch_bounds__(block_size) streams[col_id][group_id].lengths[t] = s->strm_pos[t]; if (!s->stream.data_ptrs[t]) { streams[col_id][group_id].data_ptrs[t] = - static_cast(const_cast(s->chunk.leaf_column->head())) + + static_cast(const_cast(s->chunk.leaf_column->head())) + (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len; } } @@ -929,14 +929,14 @@ __global__ void __launch_bounds__(block_size) // blockDim {512,1,1} template __global__ void __launch_bounds__(block_size) - gpuEncodeStringDictionaries(StripeDictionary *stripes, + gpuEncodeStringDictionaries(StripeDictionary* stripes, device_2dspan chunks, device_2dspan streams) { __shared__ __align__(16) orcenc_state_s state_g; __shared__ typename cub::BlockReduce::TempStorage temp_storage; - orcenc_state_s *const s = &state_g; + orcenc_state_s* const s = &state_g; uint32_t stripe_id = blockIdx.x; uint32_t cid = (blockIdx.y) ? CI_DICTIONARY : CI_DATA2; int t = threadIdx.x; @@ -953,7 +953,7 @@ __global__ void __launch_bounds__(block_size) s->nrows = s->u.dict_stripe.num_strings; s->cur_row = 0; } - column_device_view *string_column = s->u.dict_stripe.leaf_column; + column_device_view* string_column = s->u.dict_stripe.leaf_column; auto const dict_data = s->u.dict_stripe.dict_data; __syncthreads(); if (s->chunk.encoding_kind != DICTIONARY_V2) { @@ -965,7 +965,7 @@ __global__ void __launch_bounds__(block_size) uint32_t string_idx = (t < numvals) ? 
dict_data[s->cur_row + t] : 0; if (cid == CI_DICTIONARY) { // Encoding string contents - const char *ptr = 0; + const char* ptr = 0; uint32_t count = 0; if (t < numvals) { auto string_val = string_column->element(string_idx); @@ -1026,7 +1026,7 @@ __global__ void __launch_bounds__(1024) { __shared__ __align__(16) StripeStream ss; __shared__ __align__(16) encoder_chunk_streams strm0; - __shared__ uint8_t *volatile ck_curptr_g; + __shared__ uint8_t* volatile ck_curptr_g; __shared__ uint32_t volatile ck_curlen_g; auto const stripe_id = blockIdx.x; @@ -1041,7 +1041,7 @@ __global__ void __launch_bounds__(1024) auto const cid = ss.stream_type; auto dst_ptr = strm0.data_ptrs[cid] + strm0.lengths[cid]; for (auto group = ss.first_chunk_id + 1; group < ss.first_chunk_id + ss.num_chunks; ++group) { - uint8_t *src_ptr; + uint8_t* src_ptr; uint32_t len; if (t == 0) { src_ptr = streams[ss.column_id][group].data_ptrs[cid]; @@ -1080,13 +1080,13 @@ __global__ void __launch_bounds__(1024) __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? - gpu_inflate_input_s *comp_in, - gpu_inflate_status_s *comp_out, - uint8_t *compressed_bfr, + gpu_inflate_input_s* comp_in, + gpu_inflate_status_s* comp_out, + uint8_t* compressed_bfr, uint32_t comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t *volatile uncomp_base_g; + __shared__ uint8_t* volatile uncomp_base_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; @@ -1103,8 +1103,8 @@ __global__ void __launch_bounds__(256) dst = compressed_bfr + ss.bfr_offset; num_blocks = (ss.stream_size > 0) ? (ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { - gpu_inflate_input_s *blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s *blk_out = &comp_out[ss.first_block + b]; + gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; + gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); blk_in->srcDevice = src + b * comp_blk_size; blk_in->srcSize = blk_size; @@ -1130,21 +1130,21 @@ __global__ void __launch_bounds__(256) // blockDim {1024,1,1} __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, - gpu_inflate_input_s *comp_in, - gpu_inflate_status_s *comp_out, - uint8_t *compressed_bfr, + gpu_inflate_input_s* comp_in, + gpu_inflate_status_s* comp_out, + uint8_t* compressed_bfr, uint32_t comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ const uint8_t *volatile comp_src_g; + __shared__ const uint8_t* volatile comp_src_g; __shared__ uint32_t volatile comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; uint32_t t = threadIdx.x; uint32_t num_blocks, b, blk_size; - const uint8_t *src; - uint8_t *dst; + const uint8_t* src; + uint8_t* dst; if (t == 0) ss = strm_desc[stripe_id][stream_id]; __syncthreads(); @@ -1154,21 +1154,21 @@ __global__ void __launch_bounds__(1024) b = 0; do { if (t == 0) { - gpu_inflate_input_s *blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s *blk_out = &comp_out[ss.first_block + b]; + gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; + gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; uint32_t src_len = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); uint32_t dst_len = (blk_out->status == 0) ? 
blk_out->bytes_written : src_len; uint32_t blk_size24; if (dst_len >= src_len) { // Copy from uncompressed source - src = static_cast(blk_in->srcDevice); + src = static_cast(blk_in->srcDevice); blk_out->bytes_written = src_len; dst_len = src_len; blk_size24 = dst_len * 2 + 1; } else { // Compressed block - src = static_cast(blk_in->dstDevice); + src = static_cast(blk_in->dstDevice); blk_size24 = dst_len * 2 + 0; } dst[0] = static_cast(blk_size24 >> 0); @@ -1207,7 +1207,7 @@ void EncodeOrcColumnData(device_2dspan chunks, gpuEncodeOrcColumnData<512><<>>(chunks, streams); } -void EncodeStripeDictionaries(StripeDictionary *stripes, +void EncodeStripeDictionaries(StripeDictionary* stripes, device_2dspan chunks, uint32_t num_string_columns, uint32_t num_stripes, @@ -1220,7 +1220,7 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, <<>>(stripes, chunks, enc_streams); } -void set_chunk_columns(const table_device_view &view, +void set_chunk_columns(const table_device_view& view, device_2dspan chunks, rmm::cuda_stream_view stream) { @@ -1239,14 +1239,14 @@ void CompactOrcDataStreams(device_2dspan strm_desc, gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); } -void CompressOrcDataStreams(uint8_t *compressed_data, +void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s *comp_in, - gpu_inflate_status_s *comp_out, + gpu_inflate_input_s* comp_in, + gpu_inflate_status_s* comp_out, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 42cb15a56b7..317b7255718 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -32,11 +32,11 @@ struct compressed_stream_s { // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( - CompressedStreamInfo *strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr) + CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr) { __shared__ compressed_stream_s strm_g[4]; - compressed_stream_s *const s = &strm_g[threadIdx.x / 32]; + compressed_stream_s* const s = &strm_g[threadIdx.x / 32]; int strm_id = blockIdx.x * 4 + (threadIdx.x / 32); int lane_id = threadIdx.x % 32; @@ -45,9 +45,9 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat __syncthreads(); if (strm_id < num_streams) { // Walk through the compressed blocks - const uint8_t *cur = s->info.compressed_data; - const uint8_t *end = cur + s->info.compressed_data_size; - uint8_t *uncompressed = s->info.uncompressed_data; + const uint8_t* cur = s->info.compressed_data; + const uint8_t* end = cur + s->info.compressed_data_size; + uint8_t* uncompressed = s->info.uncompressed_data; size_t max_uncompressed_size = 0; uint32_t num_compressed_blocks = 0; uint32_t num_uncompressed_blocks = 0; @@ -55,7 +55,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t block_len = shuffle((lane_id == 0) ? 
cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; - gpu_inflate_input_s *init_ctl = nullptr; + gpu_inflate_input_s* init_ctl = nullptr; block_len >>= 1; cur += 3; if (block_len > block_size || cur + block_len > end) { @@ -67,10 +67,9 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat // TBD: For some codecs like snappy, it wouldn't be too difficult to get the actual // uncompressed size and avoid waste due to block size alignment For now, rely on the max // compression ratio to limit waste for the most extreme cases (small single-block streams) - uncompressed_size = - (is_uncompressed) - ? block_len - : (block_len < (block_size >> log2maxcr)) ? block_len << log2maxcr : block_size; + uncompressed_size = (is_uncompressed) ? block_len + : (block_len < (block_size >> log2maxcr)) ? block_len << log2maxcr + : block_size; if (is_uncompressed) { if (uncompressed_size <= 32) { // For short blocks, copy the uncompressed data to output @@ -94,7 +93,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat num_compressed_blocks++; } if (!lane_id && init_ctl) { - s->ctl.srcDevice = const_cast(cur); + s->ctl.srcDevice = const_cast(cur); s->ctl.srcSize = block_len; s->ctl.dstDevice = uncompressed + max_uncompressed_size; s->ctl.dstSize = uncompressed_size; @@ -118,11 +117,11 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128, 8) - gpuPostDecompressionReassemble(CompressedStreamInfo *strm_info, int32_t num_streams) + gpuPostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams) { __shared__ compressed_stream_s strm_g[4]; - compressed_stream_s *const s = &strm_g[threadIdx.x / 32]; + compressed_stream_s* const s = &strm_g[threadIdx.x / 32]; int strm_id = blockIdx.x * 4 + (threadIdx.x / 32); int lane_id = threadIdx.x % 32; @@ -133,12 +132,12 @@ extern "C" __global__ void __launch_bounds__(128, 8) s->info.num_compressed_blocks + s->info.num_uncompressed_blocks > 0 && s->info.max_uncompressed_size > 0) { // Walk through the compressed blocks - const uint8_t *cur = s->info.compressed_data; - const uint8_t *end = cur + s->info.compressed_data_size; - const gpu_inflate_input_s *dec_in = s->info.decctl; - const gpu_inflate_status_s *dec_out = s->info.decstatus; - uint8_t *uncompressed_actual = s->info.uncompressed_data; - uint8_t *uncompressed_estimated = uncompressed_actual; + const uint8_t* cur = s->info.compressed_data; + const uint8_t* end = cur + s->info.compressed_data_size; + const gpu_inflate_input_s* dec_in = s->info.decctl; + const gpu_inflate_status_s* dec_out = s->info.decstatus; + uint8_t* uncompressed_actual = s->info.uncompressed_data; + uint8_t* uncompressed_estimated = uncompressed_actual; uint32_t num_compressed_blocks = 0; uint32_t max_compressed_blocks = s->info.num_compressed_blocks; @@ -159,9 +158,9 @@ extern "C" __global__ void __launch_bounds__(128, 8) break; } uncompressed_size_est = - shuffle((lane_id == 0) ? *(const uint32_t *)&dec_in[num_compressed_blocks].dstSize : 0); + shuffle((lane_id == 0) ? *(const uint32_t*)&dec_in[num_compressed_blocks].dstSize : 0); uncompressed_size_actual = shuffle( - (lane_id == 0) ? *(const uint32_t *)&dec_out[num_compressed_blocks].bytes_written : 0); + (lane_id == 0) ? 
*(const uint32_t*)&dec_out[num_compressed_blocks].bytes_written : 0); } // In practice, this should never happen with a well-behaved writer, as we would expect the // uncompressed size to always be equal to the compression block size except for the last @@ -219,13 +218,13 @@ enum row_entry_state_e { * @param[in] end end of byte stream * @return bytes consumed */ -static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s *s, - const uint8_t *start, - const uint8_t *end) +static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s* s, + const uint8_t* start, + const uint8_t* end) { constexpr uint32_t pb_rowindexentry_id = static_cast(PB_TYPE_FIXEDLEN) + 8; - const uint8_t *cur = start; + const uint8_t* cur = start; row_entry_state_e state = NOT_FOUND; uint32_t length = 0, strm_idx_id = s->chunk.skip_count >> 8, idx_id = 1, ci_id = CI_PRESENT, pos_end = 0; @@ -268,9 +267,9 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s *s, } break; case STORE_INDEX0: - ci_id = (idx_id == (strm_idx_id & 0xff)) - ? CI_DATA - : (idx_id == ((strm_idx_id >> 8) & 0xff)) ? CI_DATA2 : CI_PRESENT; + ci_id = (idx_id == (strm_idx_id & 0xff)) ? CI_DATA + : (idx_id == ((strm_idx_id >> 8) & 0xff)) ? CI_DATA2 + : CI_PRESENT; idx_id++; if (s->is_compressed) { if (ci_id < CI_PRESENT) s->row_index_entry[0][ci_id] = v; @@ -313,9 +312,9 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s *s, * @param[in,out] s row group index state * @param[in] num_rowgroups Number of index entries to read */ -static __device__ void gpuReadRowGroupIndexEntries(rowindex_state_s *s, int num_rowgroups) +static __device__ void gpuReadRowGroupIndexEntries(rowindex_state_s* s, int num_rowgroups) { - const uint8_t *index_data = s->chunk.streams[CI_INDEX]; + const uint8_t* index_data = s->chunk.streams[CI_INDEX]; int index_data_len = s->chunk.strm_len[CI_INDEX]; for (int i = 0; i < num_rowgroups; i++) { s->row_index_entry[0][0] = 0; @@ -347,7 +346,7 @@ static __device__ void gpuReadRowGroupIndexEntries(rowindex_state_s *s, int num_ * @param[in] num_rowgroups Number of index entries * @param[in] t thread id */ -static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s *s, +static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, int ci_id, int num_rowgroups, int t) @@ -356,10 +355,10 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s *s, if (strm_len > 0) { int32_t compressed_offset = (t < num_rowgroups) ? 
s->compressed_offset[t][ci_id] : 0; if (compressed_offset > 0) { - const uint8_t *start = s->strm_info[ci_id].compressed_data; - const uint8_t *cur = start; - const uint8_t *end = cur + s->strm_info[ci_id].compressed_data_size; - gpu_inflate_status_s *decstatus = s->strm_info[ci_id].decstatus; + const uint8_t* start = s->strm_info[ci_id].compressed_data; + const uint8_t* cur = start; + const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; + gpu_inflate_status_s* decstatus = s->strm_info[ci_id].decstatus; uint32_t uncomp_offset = 0; for (;;) { uint32_t block_len, is_uncompressed; @@ -392,19 +391,23 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s *s, * @param[in] num_columns Number of columns * @param[in] num_stripes Number of stripes * @param[in] num_rowgroups Number of row groups + * @param[in] rowidx_stride Row index stride + * @param[in] use_base_stride Whether to use base stride obtained from meta or use the computed + * value */ // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128, 8) - gpuParseRowGroupIndex(RowGroup *row_groups, - CompressedStreamInfo *strm_info, - ColumnDesc *chunks, + gpuParseRowGroupIndex(RowGroup* row_groups, + CompressedStreamInfo* strm_info, + ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, uint32_t num_rowgroups, - uint32_t rowidx_stride) + uint32_t rowidx_stride, + bool use_base_stride) { __shared__ __align__(16) rowindex_state_s state_g; - rowindex_state_s *const s = &state_g; + rowindex_state_s* const s = &state_g; uint32_t chunk_id = blockIdx.y * num_columns + blockIdx.x; int t = threadIdx.x; @@ -415,11 +418,10 @@ extern "C" __global__ void __launch_bounds__(128, 8) if (s->chunk.strm_len[1] > 0) s->strm_info[1] = strm_info[s->chunk.strm_id[1]]; } - uint32_t rowgroups_in_chunk = - (rowidx_stride > 0) ? (s->chunk.num_rows + rowidx_stride - 1) / rowidx_stride : 1; - s->rowgroup_start = s->chunk.rowgroup_id; - s->rowgroup_end = s->rowgroup_start + rowgroups_in_chunk; - s->is_compressed = (strm_info != NULL); + uint32_t rowgroups_in_chunk = s->chunk.num_rowgroups; + s->rowgroup_start = s->chunk.rowgroup_id; + s->rowgroup_end = s->rowgroup_start + rowgroups_in_chunk; + s->is_compressed = (strm_info != NULL); } __syncthreads(); while (s->rowgroup_start < s->rowgroup_end) { @@ -443,10 +445,19 @@ extern "C" __global__ void __launch_bounds__(128, 8) t4 = t & 3; t32 = t >> 2; for (int i = t32; i < num_rowgroups; i += 32) { + auto const num_rows = + (use_base_stride) ? rowidx_stride + : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows; + auto const start_row = + (use_base_stride) + ? 
rowidx_stride + : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { - ((uint32_t *)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t *)&s->rowgroups[i])[j]; + ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = + ((volatile uint32_t*)&s->rowgroups[i])[j]; } + row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; + row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row = start_row; } __syncthreads(); if (t == 0) { s->rowgroup_start += num_rowgroups; } @@ -454,7 +465,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) } } -void __host__ ParseCompressedStripeData(CompressedStreamInfo *strm_info, +void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t compression_block_size, uint32_t log2maxcr, @@ -466,7 +477,7 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo *strm_info, strm_info, num_streams, compression_block_size, log2maxcr); } -void __host__ PostDecompressionReassemble(CompressedStreamInfo *strm_info, +void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams, rmm::cuda_stream_view stream) { @@ -485,21 +496,31 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo *strm_info, * @param[in] num_columns Number of columns * @param[in] num_stripes Number of stripes * @param[in] num_rowgroups Number of row groups + * @param[in] rowidx_stride Row index stride + * @param[in] use_base_stride Whether to use base stride obtained from meta or use the computed + * value * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void __host__ ParseRowGroupIndex(RowGroup *row_groups, - CompressedStreamInfo *strm_info, - ColumnDesc *chunks, +void __host__ ParseRowGroupIndex(RowGroup* row_groups, + CompressedStreamInfo* strm_info, + ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t rowidx_stride, + bool use_base_stride, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid(num_columns, num_stripes); // 1 column chunk per block - gpuParseRowGroupIndex<<>>( - row_groups, strm_info, chunks, num_columns, num_stripes, num_rowgroups, rowidx_stride); + gpuParseRowGroupIndex<<>>(row_groups, + strm_info, + chunks, + num_columns, + num_stripes, + num_rowgroups, + rowidx_stride, + use_base_stride); } } // namespace gpu diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 81ffa954c1a..f5bda3401c0 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -76,7 +76,7 @@ struct timezone_file { { return (is_64bit ? 
                       sizeof(uint64_t) : sizeof(uint32_t)) + sizeof(uint32_t);
   }
 
-  static constexpr auto file_content_size_32(timezone_file_header const &header) noexcept
+  static constexpr auto file_content_size_32(timezone_file_header const& header) noexcept
   {
     return header.timecnt * sizeof(uint32_t) +  // transition times
            header.timecnt * sizeof(uint8_t) +   // transition time index
@@ -100,9 +100,9 @@ struct timezone_file {
     header.charcnt = __builtin_bswap32(header.charcnt);
   }
 
-  void read_header(std::ifstream &input_file, size_t file_size)
+  void read_header(std::ifstream& input_file, size_t file_size)
   {
-    input_file.read(reinterpret_cast<char *>(&header), sizeof(header));
+    input_file.read(reinterpret_cast<char*>(&header), sizeof(header));
     CUDF_EXPECTS(!input_file.fail() && header.magic == tzif_magic,
                  "Error reading time zones file header.");
     header_to_little_endian();
@@ -113,7 +113,7 @@ struct timezone_file {
       // skip the 32-bit content
       input_file.seekg(file_content_size_32(header), std::ios_base::cur);
       // read the 64-bit header
-      input_file.read(reinterpret_cast<char *>(&header), sizeof(header));
+      input_file.read(reinterpret_cast<char*>(&header), sizeof(header));
       header_to_little_endian();
       is_header_from_64bit = true;
     }
@@ -125,7 +125,7 @@ struct timezone_file {
                  "Number of transition times is larger than the file size.");
   }
 
-  timezone_file(std::string const &timezone_name)
+  timezone_file(std::string const& timezone_name)
   {
     using std::ios_base;
@@ -142,23 +142,25 @@ struct timezone_file {
     // Read transition times (convert from 32-bit to 64-bit if necessary)
     transition_times.resize(timecnt());
     if (is_header_from_64bit) {
-      fin.read(reinterpret_cast<char *>(transition_times.data()),
+      fin.read(reinterpret_cast<char*>(transition_times.data()),
               transition_times.size() * sizeof(int64_t));
-      for (auto &tt : transition_times) { tt = __builtin_bswap64(tt); }
+      for (auto& tt : transition_times) {
+        tt = __builtin_bswap64(tt);
+      }
     } else {
       std::vector<int32_t> tt32(timecnt());
-      fin.read(reinterpret_cast<char *>(tt32.data()), tt32.size() * sizeof(int32_t));
+      fin.read(reinterpret_cast<char*>(tt32.data()), tt32.size() * sizeof(int32_t));
       std::transform(
-        tt32.cbegin(), tt32.cend(), std::back_inserter(transition_times), [](auto &tt) {
+        tt32.cbegin(), tt32.cend(), std::back_inserter(transition_times), [](auto& tt) {
           return __builtin_bswap32(tt);
         });
     }
     ttime_idx.resize(timecnt());
-    fin.read(reinterpret_cast<char *>(ttime_idx.data()), timecnt() * sizeof(uint8_t));
+    fin.read(reinterpret_cast<char*>(ttime_idx.data()), timecnt() * sizeof(uint8_t));
 
     // Read time types
     ttype.resize(typecnt());
-    fin.read(reinterpret_cast<char *>(ttype.data()), typecnt() * sizeof(localtime_type_record_s));
+    fin.read(reinterpret_cast<char*>(ttype.data()), typecnt() * sizeof(localtime_type_record_s));
     CUDF_EXPECTS(!fin.fail(), "Failed to read time types from the time zone file.");
     for (uint32_t i = 0; i < typecnt(); i++) {
       ttype[i].utcoff = __builtin_bswap32(ttype[i].utcoff);
@@ -182,7 +184,7 @@ struct timezone_file {
 
 template <class Container>
 class posix_parser {
  public:
-  posix_parser(Container const &tz_string) : cur{tz_string.begin()}, end{tz_string.end()} {}
+  posix_parser(Container const& tz_string) : cur{tz_string.begin()}, end{tz_string.end()} {}
 
   /**
    * @brief Advances the parser past a name from the posix TZ string.
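
As a concrete aside on the TZif handling in the hunks above: every multi-byte field in a zoneinfo file is big-endian, so the reader byte-swaps each value on the host, and when only the 32-bit section is present it widens the transition times to 64 bits. A minimal host-only sketch of that widening step; `widen_transition_times` is an illustrative name rather than the cuDF API, and `__builtin_bswap32` assumes GCC or Clang:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Convert a buffer of big-endian 32-bit TZif transition times to host-order
// int64_t. The cast through int32_t sign-extends, so pre-1902 (negative)
// timestamps survive the widening.
std::vector<int64_t> widen_transition_times(std::vector<uint8_t> const& raw)
{
  std::vector<int64_t> out;
  out.reserve(raw.size() / sizeof(int32_t));
  for (std::size_t pos = 0; pos + sizeof(int32_t) <= raw.size(); pos += sizeof(int32_t)) {
    uint32_t be = 0;
    std::memcpy(&be, raw.data() + pos, sizeof(be));  // alias-safe unaligned load
    out.push_back(static_cast<int32_t>(__builtin_bswap32(be)));
  }
  return out;
}
```

The same pattern covers the header counts (`timecnt`, `typecnt`, `charcnt`) and, with `__builtin_bswap64`, the 64-bit section.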
@@ -340,7 +342,7 @@ static int days_in_month(int month, bool is_leap_year) * * @return transition time in seconds from the beginning of the year */ -static int64_t get_transition_time(dst_transition_s const &trans, int year) +static int64_t get_transition_time(dst_transition_s const& trans, int year) { auto day = trans.day; @@ -365,7 +367,9 @@ static int64_t get_transition_time(dst_transition_s const &trans, int year) day += 7; } // Add months - for (int m = 1; m < month; m++) { day += days_in_month(m, is_leap); } + for (int m = 1; m < month; m++) { + day += days_in_month(m, is_leap); + } } else if (trans.type == 'J') { // Account for 29th of February on leap years day += (day > 31 + 29 && is_leap_year(year)); @@ -374,7 +378,7 @@ static int64_t get_transition_time(dst_transition_s const &trans, int year) return trans.time + day * day_seconds; } -timezone_table build_timezone_transition_table(std::string const &timezone_name, +timezone_table build_timezone_transition_table(std::string const& timezone_name, rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index b0231ca9e7d..e5341573418 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -56,8 +56,8 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * * @return GMT offset */ -CUDA_HOST_DEVICE_CALLABLE int32_t get_gmt_offset_impl(int64_t const *ttimes, - int32_t const *offsets, +CUDA_HOST_DEVICE_CALLABLE int32_t get_gmt_offset_impl(int64_t const* ttimes, + int32_t const* offsets, size_t count, int64_t ts) { @@ -112,8 +112,8 @@ struct timezone_table { rmm::device_uvector offsets; timezone_table() : ttimes{0, rmm::cuda_stream_default}, offsets{0, rmm::cuda_stream_default} {} timezone_table(int32_t gmt_offset, - rmm::device_uvector &&ttimes, - rmm::device_uvector &&offsets) + rmm::device_uvector&& ttimes, + rmm::device_uvector&& offsets) : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} { } @@ -130,7 +130,7 @@ struct timezone_table { * * @return The transition table for the given timezone */ -timezone_table build_timezone_transition_table(std::string const &timezone_name, +timezone_table build_timezone_transition_table(std::string const& timezone_name, rmm::cuda_stream_view stream); } // namespace io diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 4a2330d479b..0cd3f333ba3 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -138,8 +138,8 @@ class orc_column_view { */ explicit orc_column_view(size_t index, size_t str_id, - column_view const &col, - const table_metadata *metadata, + column_view const& col, + const table_metadata* metadata, rmm::cuda_stream_view stream) : _index(index), _str_id(str_id), @@ -168,7 +168,7 @@ class orc_column_view { /** * @brief Function that associates an existing dictionary chunk allocation */ - void attach_dict_chunk(gpu::DictionaryChunk *host_dict, gpu::DictionaryChunk *dev_dict) + void attach_dict_chunk(gpu::DictionaryChunk* host_dict, gpu::DictionaryChunk* dev_dict) { dict = host_dict; d_dict = dev_dict; @@ -180,14 +180,14 @@ class orc_column_view { } auto device_dict_chunk() const { return d_dict; } - auto const &decimal_offsets() const { return d_decimal_offsets; } - void attach_decimal_offsets(uint32_t *sizes_ptr) { d_decimal_offsets = sizes_ptr; } + auto const& decimal_offsets() const { return d_decimal_offsets; } + void attach_decimal_offsets(uint32_t* sizes_ptr) { 
d_decimal_offsets = sizes_ptr; } /** * @brief Function that associates an existing stripe dictionary allocation */ - void attach_stripe_dict(gpu::StripeDictionary *host_stripe_dict, - gpu::StripeDictionary *dev_stripe_dict) + void attach_stripe_dict(gpu::StripeDictionary* host_stripe_dict, + gpu::StripeDictionary* dev_stripe_dict) { stripe_dict = host_stripe_dict; d_stripe_dict = dev_stripe_dict; @@ -207,7 +207,7 @@ class orc_column_view { auto data_count() const noexcept { return _data_count; } size_t null_count() const noexcept { return _null_count; } bool nullable() const noexcept { return (_nulls != nullptr); } - uint32_t const *nulls() const noexcept { return _nulls; } + uint32_t const* nulls() const noexcept { return _nulls; } auto scale() const noexcept { return _scale; } auto precision() const noexcept { return _precision; } @@ -226,7 +226,7 @@ class orc_column_view { size_t _type_width = 0; size_type _data_count = 0; size_t _null_count = 0; - uint32_t const *_nulls = nullptr; + uint32_t const* _nulls = nullptr; // ORC-related members std::string _name{}; @@ -238,21 +238,21 @@ class orc_column_view { // String dictionary-related members size_t dict_stride = 0; - gpu::DictionaryChunk const *dict = nullptr; - gpu::StripeDictionary const *stripe_dict = nullptr; - gpu::DictionaryChunk *d_dict = nullptr; - gpu::StripeDictionary *d_stripe_dict = nullptr; + gpu::DictionaryChunk const* dict = nullptr; + gpu::StripeDictionary const* stripe_dict = nullptr; + gpu::DictionaryChunk* d_dict = nullptr; + gpu::StripeDictionary* d_stripe_dict = nullptr; // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. - uint32_t *d_decimal_offsets = nullptr; + uint32_t* d_decimal_offsets = nullptr; }; std::vector writer::impl::gather_stripe_info( host_span columns, size_t num_rowgroups) { auto const is_any_column_string = - std::any_of(columns.begin(), columns.end(), [](auto const &col) { return col.is_string(); }); + std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.is_string(); }); // Apply rows per stripe limit to limit string dictionaries size_t const max_stripe_rows = is_any_column_string ? 
1000000 : 5000000; @@ -260,7 +260,7 @@ std::vector writer::impl::gather_stripe_info( for (size_t rowgroup = 0, stripe_start = 0, stripe_size = 0; rowgroup < num_rowgroups; ++rowgroup) { auto const rowgroup_size = - std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const &col) { + std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { if (col.is_string()) { const auto dt = col.host_dict_chunk(rowgroup); return total_size + row_index_stride_ + dt->string_char_count; @@ -285,19 +285,19 @@ std::vector writer::impl::gather_stripe_info( return infos; } -void writer::impl::init_dictionaries(const table_device_view &view, - orc_column_view *columns, - std::vector const &str_col_ids, +void writer::impl::init_dictionaries(const table_device_view& view, + orc_column_view* columns, + std::vector const& str_col_ids, device_span d_str_col_ids, - uint32_t *dict_data, - uint32_t *dict_index, - hostdevice_vector *dict) + uint32_t* dict_data, + uint32_t* dict_index, + hostdevice_vector* dict) { const size_t num_rowgroups = dict->size() / str_col_ids.size(); // Setup per-rowgroup dictionary indexes for each dictionary-aware column for (size_t i = 0; i < str_col_ids.size(); ++i) { - auto &str_column = columns[str_col_ids[i]]; + auto& str_column = columns[str_col_ids[i]]; str_column.set_dict_stride(str_col_ids.size()); str_column.attach_dict_chunk(dict->host_ptr(), dict->device_ptr()); } @@ -314,21 +314,21 @@ void writer::impl::init_dictionaries(const table_device_view &view, dict->device_to_host(stream, true); } -void writer::impl::build_dictionaries(orc_column_view *columns, - std::vector const &str_col_ids, +void writer::impl::build_dictionaries(orc_column_view* columns, + std::vector const& str_col_ids, host_span stripe_bounds, - hostdevice_vector const &dict, - uint32_t *dict_index, - hostdevice_vector &stripe_dict) + hostdevice_vector const& dict, + uint32_t* dict_index, + hostdevice_vector& stripe_dict) { const auto num_rowgroups = dict.size() / str_col_ids.size(); for (size_t col_idx = 0; col_idx < str_col_ids.size(); ++col_idx) { - auto &str_column = columns[str_col_ids[col_idx]]; + auto& str_column = columns[str_col_ids[col_idx]]; str_column.attach_stripe_dict(stripe_dict.host_ptr(), stripe_dict.device_ptr()); - for (auto const &stripe : stripe_bounds) { - auto &sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; + for (auto const& stripe : stripe_bounds) { + auto& sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; sd.dict_data = str_column.host_dict_chunk(stripe.first)->dict_data; sd.dict_index = dict_index + col_idx * str_column.data_count(); // Indexed by abs row sd.column_id = str_col_ids[col_idx]; @@ -337,7 +337,7 @@ void writer::impl::build_dictionaries(orc_column_view *columns, sd.dict_char_count = 0; sd.num_strings = std::accumulate(stripe.cbegin(), stripe.cend(), 0, [&](auto dt_str_cnt, auto rg_idx) { - const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx]; + const auto& dt = dict[rg_idx * str_col_ids.size() + col_idx]; return dt_str_cnt + dt.num_dict_strings; }); sd.leaf_column = dict[col_idx].leaf_column; @@ -353,13 +353,13 @@ void writer::impl::build_dictionaries(orc_column_view *columns, stripe_bounds.back().cend(), string_column_cost{}, [&](auto cost, auto rg_idx) -> string_column_cost { - const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx]; + const auto& dt = dict[rg_idx * str_col_ids.size() + col_idx]; return {cost.direct + dt.string_char_count, cost.dictionary + dt.dict_char_count 
+ dt.num_dict_strings}; }); // Disable dictionary if it does not reduce the output size if (col_cost.dictionary >= col_cost.direct) { - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { stripe_dict[stripe.id * str_col_ids.size() + col_idx].dict_data = nullptr; } } @@ -379,19 +379,19 @@ void writer::impl::build_dictionaries(orc_column_view *columns, orc_streams writer::impl::create_streams(host_span columns, host_span stripe_bounds, - std::map const &decimal_column_sizes) + std::map const& decimal_column_sizes) { // 'column 0' row index stream std::vector streams{{ROW_INDEX, 0}}; // TODO: Separate index and data streams? // First n + 1 streams are row index streams streams.reserve(columns.size() + 1); - std::transform(columns.begin(), columns.end(), std::back_inserter(streams), [](auto const &col) { + std::transform(columns.begin(), columns.end(), std::back_inserter(streams), [](auto const& col) { return Stream{ROW_INDEX, col.id()}; }); std::vector ids(columns.size() * gpu::CI_NUM_STREAMS, -1); - for (auto &column : columns) { + for (auto& column : columns) { TypeKind kind = column.orc_kind(); StreamKind data_kind = DATA; StreamKind data2_kind = LENGTH; @@ -454,7 +454,7 @@ orc_streams writer::impl::create_streams(host_span columns, size_t dict_data_size = 0; size_t dict_strings = 0; size_t dict_lengths_div512 = 0; - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { const auto sd = column.host_stripe_dict(stripe.id); enable_dict = (enable_dict && sd->dict_data != nullptr); if (enable_dict) { @@ -546,13 +546,13 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( size_t non_rle_data_size = 0; size_t rle_data_size = 0; for (size_t i = 0; i < streams.size(); ++i) { - const auto &stream = streams[i]; + const auto& stream = streams[i]; auto const is_rle_data = [&]() { // First stream is an index stream, don't check types, etc. 
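
To make the bookkeeping in `compute_offsets` concrete: streams whose bytes are copied straight out of column memory (index streams, direct string characters, dictionary characters) are packed into one region, and RLE-encoded streams into another, with each stream recording its offset within its own bucket. A minimal host-side sketch of that two-bucket layout; `stream_desc` and `compute_bucketed_offsets` are illustrative names, not cuDF types:

```cpp
#include <cstddef>
#include <vector>

struct stream_desc {
  std::size_t size;  // bytes this stream will occupy
  bool is_rle;       // true if the bytes are RLE-encoded rather than copied
};

struct stream_offsets {
  std::vector<std::size_t> offsets;  // each stream's offset within its bucket
  std::size_t non_rle_size = 0;      // total bytes of directly-copied data
  std::size_t rle_size     = 0;      // total bytes of RLE-encoded data
};

stream_offsets compute_bucketed_offsets(std::vector<stream_desc> const& streams)
{
  stream_offsets result;
  result.offsets.reserve(streams.size());
  for (auto const& s : streams) {
    auto& bucket = s.is_rle ? result.rle_size : result.non_rle_size;
    result.offsets.push_back(bucket);  // next free byte in that bucket
    bucket += s.size;
  }
  return result;
}
```

One apparent motivation for keeping the directly-copied bytes contiguous is that they can go to the sink without a host round trip, as `write_data_stream` later in this diff does when `is_device_write_preferred` returns true.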
if (!stream.column_index().has_value()) return true; - auto const &column = columns[stream.column_index().value()]; + auto const& column = columns[stream.column_index().value()]; // Dictionary encoded string column - dictionary characters or // directly encoded string - column characters if (column.orc_kind() == TypeKind::STRING && @@ -581,18 +581,18 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( } struct segmented_valid_cnt_input { - bitmask_type const *mask; + bitmask_type const* mask; std::vector indices; }; -encoded_data writer::impl::encode_columns(const table_device_view &view, +encoded_data writer::impl::encode_columns(const table_device_view& view, host_span columns, - std::vector const &str_col_ids, - rmm::device_uvector &&dict_data, - rmm::device_uvector &&dict_index, - encoder_decimal_info &&dec_chunk_sizes, + std::vector const& str_col_ids, + rmm::device_uvector&& dict_data, + rmm::device_uvector&& dict_index, + encoder_decimal_info&& dec_chunk_sizes, host_span stripe_bounds, - orc_streams const &streams) + orc_streams const& streams) { auto const num_columns = columns.size(); auto const num_rowgroups = stripes_size(stripe_bounds); @@ -604,11 +604,11 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, // Initialize column chunks' descriptions std::map validity_check_inputs; - for (auto const &column : columns) { - for (auto const &stripe : stripe_bounds) { + for (auto const& column : columns) { + for (auto const& stripe : stripe_bounds) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { auto const rg_idx = *rg_idx_it; - auto &ck = chunks[column.index()][rg_idx]; + auto& ck = chunks[column.index()][rg_idx]; ck.start_row = (rg_idx * row_index_stride_); ck.num_rows = std::min(row_index_stride_, column.data_count() - ck.start_row); @@ -618,7 +618,7 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) ? 
column.host_stripe_dict(stripe.id)->dict_index : nullptr; - ck.dtype_len = 1; + ck.dtype_len = 1; } else { ck.dtype_len = column.type_width(); } @@ -632,22 +632,22 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, auto validity_check_indices = [&](size_t col_idx) { std::vector indices; - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend() - 1; ++rg_idx_it) { - auto const &chunk = chunks[col_idx][*rg_idx_it]; + auto const& chunk = chunks[col_idx][*rg_idx_it]; indices.push_back(chunk.start_row); indices.push_back(chunk.start_row + chunk.num_rows); } } return indices; }; - for (auto const &column : columns) { + for (auto const& column : columns) { if (column.orc_kind() == TypeKind::BOOLEAN && column.nullable()) { validity_check_inputs[column.index()] = {column.nulls(), validity_check_indices(column.index())}; } } - for (auto &cnt_in : validity_check_inputs) { + for (auto& cnt_in : validity_check_inputs) { auto const valid_counts = segmented_count_set_bits(cnt_in.second.mask, cnt_in.second.indices); CUDF_EXPECTS( std::none_of(valid_counts.cbegin(), @@ -659,13 +659,13 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, } for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto const &column = columns[col_idx]; + auto const& column = columns[col_idx]; auto col_streams = chunk_streams[col_idx]; - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { auto const rg_idx = *rg_idx_it; - auto const &ck = chunks[col_idx][rg_idx]; - auto &strm = col_streams[rg_idx]; + auto const& ck = chunks[col_idx][rg_idx]; + auto& strm = col_streams[rg_idx]; for (int strm_type = 0; strm_type < gpu::CI_NUM_STREAMS; ++strm_type) { auto const strm_id = streams.id(col_idx * gpu::CI_NUM_STREAMS + strm_type); @@ -688,7 +688,7 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, if (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2) strm.data_ptrs[strm_type] += stream_offsets.non_rle_data_size; } else { - auto const &strm_up = col_streams[stripe_dict[-dict_stride].start_chunk]; + auto const& strm_up = col_streams[stripe_dict[-dict_stride].start_chunk]; strm.data_ptrs[strm_type] = strm_up.data_ptrs[strm_type] + strm_up.lengths[strm_type]; } @@ -754,19 +754,19 @@ std::vector writer::impl::gather_stripes( size_t num_rows, size_t num_index_streams, host_span stripe_bounds, - hostdevice_2dvector *enc_streams, - hostdevice_2dvector *strm_desc) + hostdevice_2dvector* enc_streams, + hostdevice_2dvector* strm_desc) { std::vector stripes(stripe_bounds.size()); - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { for (size_t col_idx = 0; col_idx < enc_streams->size().first; col_idx++) { - const auto &strm = (*enc_streams)[col_idx][stripe.first]; + const auto& strm = (*enc_streams)[col_idx][stripe.first]; // Assign stream data of column data stream(s) for (int k = 0; k < gpu::CI_INDEX; k++) { const auto stream_id = strm.ids[k]; if (stream_id != -1) { - auto *ss = &(*strm_desc)[stripe.id][stream_id - num_index_streams]; + auto* ss = &(*strm_desc)[stripe.id][stream_id - num_index_streams]; ss->stream_size = 0; ss->first_chunk_id = stripe.first; ss->num_chunks = stripe.size; @@ -790,7 +790,7 @@ std::vector writer::impl::gather_stripes( } std::vector> writer::impl::gather_statistic_blobs( - 
const table_device_view &table, + const table_device_view& table, host_span columns, host_span stripe_bounds) { @@ -804,8 +804,8 @@ std::vector> writer::impl::gather_statistic_blobs( rmm::device_uvector stat_chunks(num_chunks + num_stat_blobs, stream); rmm::device_uvector stat_groups(num_chunks, stream); - for (auto const &column : columns) { - stats_column_desc *desc = &stat_desc[column.index()]; + for (auto const& column : columns) { + stats_column_desc* desc = &stat_desc[column.index()]; switch (column.orc_kind()) { case TypeKind::BYTE: desc->stats_dtype = dtype_int8; break; case TypeKind::SHORT: desc->stats_dtype = dtype_int16; break; @@ -834,13 +834,13 @@ std::vector> writer::impl::gather_statistic_blobs( } else { desc->ts_scale = 0; } - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { auto grp = &stat_merge[column.index() * stripe_bounds.size() + stripe.id]; grp->col = stat_desc.device_ptr(column.index()); grp->start_chunk = static_cast(column.index() * num_rowgroups + stripe.first); grp->num_chunks = stripe.size; } - statistics_merge_group *col_stats = + statistics_merge_group* col_stats = &stat_merge[stripe_bounds.size() * columns.size() + column.index()]; col_stats->col = stat_desc.device_ptr(column.index()); col_stats->start_chunk = static_cast(column.index() * stripe_bounds.size()); @@ -888,8 +888,8 @@ std::vector> writer::impl::gather_statistic_blobs( blobs.device_to_host(stream, true); for (size_t i = 0; i < num_stat_blobs; i++) { - const uint8_t *stat_begin = blobs.host_ptr(stat_merge[i].start_chunk); - const uint8_t *stat_end = stat_begin + stat_merge[i].num_chunks; + const uint8_t* stat_begin = blobs.host_ptr(stat_merge[i].start_chunk); + const uint8_t* stat_end = stat_begin + stat_merge[i].num_chunks; stat_blobs[i].assign(stat_begin, stat_end); } @@ -899,13 +899,13 @@ std::vector> writer::impl::gather_statistic_blobs( void writer::impl::write_index_stream(int32_t stripe_id, int32_t stream_id, host_span columns, - stripe_rowgroups const &rowgroups_range, + stripe_rowgroups const& rowgroups_range, host_2dspan enc_streams, host_2dspan strm_desc, host_span comp_out, - StripeInformation *stripe, - orc_streams *streams, - ProtobufWriter *pbw) + StripeInformation* stripe, + orc_streams* streams, + ProtobufWriter* pbw) { row_group_index_info present; row_group_index_info data; @@ -913,13 +913,13 @@ void writer::impl::write_index_stream(int32_t stripe_id, auto kind = TypeKind::STRUCT; auto const column_id = stream_id - 1; - auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const &stream, + auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const& stream, gpu::StreamIndexType type) { row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; if (compression_kind_ != NONE) { - auto const &ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; + auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; record.comp_size = ss.stream_size; @@ -927,9 +927,9 @@ void writer::impl::write_index_stream(int32_t stripe_id, } return record; }; - auto scan_record = [=, &comp_out](gpu::encoder_chunk_streams const &stream, + auto scan_record = [=, &comp_out](gpu::encoder_chunk_streams const& stream, gpu::StreamIndexType type, - row_group_index_info &record) { + row_group_index_info& record) { if (record.pos >= 0) { record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && @@ -945,7 +945,7 @@ void 
writer::impl::write_index_stream(int32_t stripe_id, // TBD: Not sure we need an empty index stream for column 0 if (stream_id != 0) { - const auto &strm = enc_streams[column_id][0]; + const auto& strm = enc_streams[column_id][0]; present = find_record(strm, gpu::CI_PRESENT); data = find_record(strm, gpu::CI_DATA); data2 = find_record(strm, gpu::CI_DATA2); @@ -965,7 +965,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, present.comp_pos, present.pos, data.comp_pos, data.pos, data2.comp_pos, data2.pos, kind); if (stream_id != 0) { - const auto &strm = enc_streams[column_id][rowgroup]; + const auto& strm = enc_streams[column_id][rowgroup]; scan_record(strm, gpu::CI_PRESENT, present); scan_record(strm, gpu::CI_DATA, data); scan_record(strm, gpu::CI_DATA2, data2); @@ -983,18 +983,18 @@ void writer::impl::write_index_stream(int32_t stripe_id, stripe->indexLength += buffer_.size(); } -void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc, - gpu::encoder_chunk_streams const &enc_stream, - uint8_t const *compressed_data, - uint8_t *stream_out, - StripeInformation *stripe, - orc_streams *streams) +void writer::impl::write_data_stream(gpu::StripeStream const& strm_desc, + gpu::encoder_chunk_streams const& enc_stream, + uint8_t const* compressed_data, + uint8_t* stream_out, + StripeInformation* stripe, + orc_streams* streams) { const auto length = strm_desc.stream_size; (*streams)[enc_stream.ids[strm_desc.stream_type]].length = length; if (length == 0) { return; } - const auto *stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] + const auto* stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] : (compressed_data + strm_desc.bfr_offset); if (out_sink_->is_device_write_preferred(length)) { @@ -1009,7 +1009,7 @@ void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc, stripe->dataLength += length; } -void writer::impl::add_uncompressed_block_headers(std::vector &v) +void writer::impl::add_uncompressed_block_headers(std::vector& v) { if (compression_kind_ != NONE) { size_t uncomp_len = v.size() - 3, pos = 0, block_len; @@ -1030,10 +1030,10 @@ void writer::impl::add_uncompressed_block_headers(std::vector &v) } writer::impl::impl(std::unique_ptr sink, - orc_writer_options const &options, + orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), out_sink_(std::move(sink)), @@ -1046,10 +1046,10 @@ writer::impl::impl(std::unique_ptr sink, } writer::impl::impl(std::unique_ptr sink, - chunked_orc_writer_options const &options, + chunked_orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), out_sink_(std::move(sink)), @@ -1073,7 +1073,7 @@ void writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } -rmm::device_uvector get_string_column_ids(const table_device_view &view, +rmm::device_uvector get_string_column_ids(const table_device_view& view, rmm::cuda_stream_view stream) { rmm::device_uvector string_column_ids(view.num_columns(), stream); @@ -1096,8 +1096,8 @@ rmm::device_uvector get_string_column_ids(const table_device_view &vi 
struct rowgroup_iterator { using difference_type = long; using value_type = int; - using pointer = int *; - using reference = int &; + using pointer = int*; + using reference = int&; using iterator_category = thrust::output_device_iterator_tag; size_type idx; size_type rowgroup_size; @@ -1111,7 +1111,7 @@ struct rowgroup_iterator { { return rowgroup_iterator{idx + i, rowgroup_size}; } - CUDA_HOST_DEVICE_CALLABLE rowgroup_iterator &operator++() + CUDA_HOST_DEVICE_CALLABLE rowgroup_iterator& operator++() { ++idx; return *this; @@ -1120,14 +1120,14 @@ struct rowgroup_iterator { { return (idx + offset) / rowgroup_size; } - CUDA_HOST_DEVICE_CALLABLE bool operator!=(rowgroup_iterator const &other) + CUDA_HOST_DEVICE_CALLABLE bool operator!=(rowgroup_iterator const& other) { return idx != other.idx; } }; // returns host vector of per-rowgroup sizes -encoder_decimal_info decimal_chunk_sizes(table_view const &table, +encoder_decimal_info decimal_chunk_sizes(table_view const& table, host_span orc_columns, size_type rowgroup_size, host_span stripes, @@ -1138,21 +1138,21 @@ encoder_decimal_info decimal_chunk_sizes(table_view const &table, auto const d_table = table_device_view::create(table, stream); // Compute per-element offsets (within each row group) on the device for (size_t col_idx = 0; col_idx < orc_columns.size(); ++col_idx) { - auto &orc_col = orc_columns[col_idx]; + auto& orc_col = orc_columns[col_idx]; if (orc_col.orc_kind() == DECIMAL) { - auto const &col = table.column(col_idx); - auto ¤t_sizes = + auto const& col = table.column(col_idx); + auto& current_sizes = elem_sizes.insert({col_idx, rmm::device_uvector(col.size(), stream)}) .first->second; thrust::tabulate(rmm::exec_policy(stream), current_sizes.begin(), current_sizes.end(), [table = *d_table, col_idx] __device__(auto idx) { - auto const &col = table.column(col_idx); + auto const& col = table.column(col_idx); if (col.is_null(idx)) return 0u; - int64_t const element = (col.type().id() == type_id::DECIMAL32) - ? col.element(idx) - : col.element(idx); + int64_t const element = (col.type().id() == type_id::DECIMAL32) + ? col.element(idx) + : col.element(idx); int64_t const sign = (element < 0) ? 
1 : 0; uint64_t zigzaged_value = ((element ^ -sign) * 2) + sign; @@ -1180,7 +1180,7 @@ encoder_decimal_info decimal_chunk_sizes(table_view const &table, auto const num_rowgroups = stripes_size(stripes); auto d_tmp_rowgroup_sizes = rmm::device_uvector(num_rowgroups, stream); std::map> rg_sizes; - for (auto const &[col_idx, esizes] : elem_sizes) { + for (auto const& [col_idx, esizes] : elem_sizes) { // Copy last elem in each row group - equal to row group size thrust::tabulate( rmm::exec_policy(stream), @@ -1196,13 +1196,13 @@ encoder_decimal_info decimal_chunk_sizes(table_view const &table, } std::map decimal_column_sizes( - std::map> const &chunk_sizes) + std::map> const& chunk_sizes) { std::map column_sizes; std::transform(chunk_sizes.cbegin(), chunk_sizes.cend(), std::inserter(column_sizes, column_sizes.end()), - [](auto const &chunk_size) -> std::pair { + [](auto const& chunk_size) -> std::pair { return { chunk_size.first, std::accumulate(chunk_size.second.cbegin(), chunk_size.second.cend(), 0lu)}; @@ -1210,7 +1210,7 @@ std::map decimal_column_sizes( return column_sizes; } -void writer::impl::write(table_view const &table) +void writer::impl::write(table_view const& table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); auto const num_columns = table.num_columns(); @@ -1231,7 +1231,7 @@ void writer::impl::write(table_view const &table) orc_columns.reserve(num_columns); // Mapping of string columns for quick look-up std::vector str_col_ids; - for (auto const &column : table) { + for (auto const& column : table) { auto const current_id = orc_columns.size(); auto const current_str_id = str_col_ids.size(); @@ -1302,7 +1302,7 @@ void writer::impl::write(table_view const &table) for (size_t stripe_id = 0; stripe_id < stripe_bounds.size(); stripe_id++) { for (size_t i = 0; i < num_data_streams; i++) { // TODO range for (at least) - gpu::StripeStream *ss = &strm_descs[stripe_id][i]; + gpu::StripeStream* ss = &strm_descs[stripe_id][i]; if (!out_sink_->is_device_write_preferred(ss->stream_size)) { all_device_write = false; } size_t stream_size = ss->stream_size; if (compression_kind_ != NONE) { @@ -1323,7 +1323,7 @@ void writer::impl::write(table_view const &table) return pinned_buffer{nullptr, cudaFreeHost}; } else { return pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; + uint8_t* ptr = nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); return ptr; }(max_stream_size), @@ -1337,7 +1337,7 @@ void writer::impl::write(table_view const &table) hostdevice_vector comp_in(num_compressed_blocks, stream); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), num_compressed_blocks, compression_kind_, compression_blocksize_, @@ -1354,8 +1354,8 @@ void writer::impl::write(table_view const &table) // Write stripes for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto const &rowgroup_range = stripe_bounds[stripe_id]; - auto &stripe = stripes[stripe_id]; + auto const& rowgroup_range = stripe_bounds[stripe_id]; + auto& stripe = stripes[stripe_id]; stripe.offset = out_sink_->bytes_written(); @@ -1374,10 +1374,10 @@ void writer::impl::write(table_view const &table) } // Column data consisting one or more separate streams - for (auto const &strm_desc : strm_descs[stripe_id]) { + for (auto const& strm_desc : strm_descs[stripe_id]) { write_data_stream(strm_desc, 
enc_data.streams[strm_desc.column_id][rowgroup_range.first], - static_cast(compressed_data.data()), + static_cast(compressed_data.data()), stream_output.get(), &stripe, &streams); @@ -1450,7 +1450,7 @@ void writer::impl::write(table_view const &table) ff.types[0].kind = STRUCT; ff.types[0].subtypes.resize(num_columns); ff.types[0].fieldNames.resize(num_columns); - for (auto const &column : orc_columns) { + for (auto const& column : orc_columns) { ff.types[column.id()].kind = column.orc_kind(); if (column.orc_kind() == DECIMAL) { ff.types[column.id()].scale = static_cast(column.scale()); @@ -1465,7 +1465,7 @@ void writer::impl::write(table_view const &table) "Mismatch in table structure between multiple calls to write"); CUDF_EXPECTS(std::all_of(orc_columns.cbegin(), orc_columns.cend(), - [&](auto const &col) { + [&](auto const& col) { return ff.types[1 + col.index()].kind == col.orc_kind(); }), "Mismatch in column types between multiple calls to write"); @@ -1517,20 +1517,20 @@ void writer::impl::close() // Forward to implementation writer::writer(std::unique_ptr sink, - orc_writer_options const &options, + orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } // Forward to implementation writer::writer(std::unique_ptr sink, - chunked_orc_writer_options const &options, + chunked_orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } @@ -1539,7 +1539,7 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const &table) { _impl->write(table); } +void writer::write(table_view const& table) { _impl->write(table); } // Forward to implementation void writer::close() { _impl->close(); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 155c83a88d9..db5cd349198 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -74,7 +74,7 @@ struct encoder_decimal_info { }; /** - * @brief Returns the total number of rowgroups in the list of contigious stripes. + * @brief Returns the total number of rowgroups in the list of contiguous stripes. 
*/ inline auto stripes_size(host_span stripes) { @@ -110,7 +110,7 @@ class orc_streams { orc_stream_offsets compute_offsets(host_span columns, size_t num_rowgroups) const; - operator std::vector const&() const { return streams; } + operator std::vector const &() const { return streams; } private: std::vector streams; diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index a9b8eb0ac6b..dde86af68c8 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -24,7 +24,7 @@ namespace parquet { * @Brief Parquet CompactProtocolWriter class */ -size_t CompactProtocolWriter::write(const FileMetaData &f) +size_t CompactProtocolWriter::write(const FileMetaData& f) { CompactProtocolFieldWriter c(*this); c.field_int(1, f.version); @@ -48,7 +48,7 @@ size_t CompactProtocolWriter::write(const FileMetaData &f) return c.value(); } -size_t CompactProtocolWriter::write(const SchemaElement &s) +size_t CompactProtocolWriter::write(const SchemaElement& s) { CompactProtocolFieldWriter c(*this); if (s.type != UNDEFINED_TYPE) { @@ -69,7 +69,7 @@ size_t CompactProtocolWriter::write(const SchemaElement &s) return c.value(); } -size_t CompactProtocolWriter::write(const RowGroup &r) +size_t CompactProtocolWriter::write(const RowGroup& r) { CompactProtocolFieldWriter c(*this); c.field_struct_list(1, r.columns); @@ -78,7 +78,7 @@ size_t CompactProtocolWriter::write(const RowGroup &r) return c.value(); } -size_t CompactProtocolWriter::write(const KeyValue &k) +size_t CompactProtocolWriter::write(const KeyValue& k) { CompactProtocolFieldWriter c(*this); c.field_string(1, k.key); @@ -86,7 +86,7 @@ size_t CompactProtocolWriter::write(const KeyValue &k) return c.value(); } -size_t CompactProtocolWriter::write(const ColumnChunk &s) +size_t CompactProtocolWriter::write(const ColumnChunk& s) { CompactProtocolFieldWriter c(*this); if (s.file_path.size() != 0) { c.field_string(1, s.file_path); } @@ -103,7 +103,7 @@ size_t CompactProtocolWriter::write(const ColumnChunk &s) return c.value(); } -size_t CompactProtocolWriter::write(const ColumnChunkMetaData &s) +size_t CompactProtocolWriter::write(const ColumnChunkMetaData& s) { CompactProtocolFieldWriter c(*this); c.field_int(1, s.type); @@ -122,9 +122,10 @@ size_t CompactProtocolWriter::write(const ColumnChunkMetaData &s) void CompactProtocolFieldWriter::put_byte(uint8_t v) { writer.m_buf.push_back(v); } -void CompactProtocolFieldWriter::put_byte(const uint8_t *raw, uint32_t len) +void CompactProtocolFieldWriter::put_byte(const uint8_t* raw, uint32_t len) { - for (uint32_t i = 0; i < len; i++) writer.m_buf.push_back(raw[i]); + for (uint32_t i = 0; i < len; i++) + writer.m_buf.push_back(raw[i]); } uint32_t CompactProtocolFieldWriter::put_uint(uint64_t v) @@ -170,17 +171,19 @@ inline void CompactProtocolFieldWriter::field_int(int field, int64_t val) } template -inline void CompactProtocolFieldWriter::field_int_list(int field, const std::vector &val) +inline void CompactProtocolFieldWriter::field_int_list(int field, const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_LIST); put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_I32)); if (val.size() >= 0xf) put_uint(val.size()); - for (auto &v : val) { put_int(static_cast(v)); } + for (auto& v : val) { + put_int(static_cast(v)); + } current_field_value = field; } template -inline void CompactProtocolFieldWriter::field_struct(int field, const T &val) +inline void 
CompactProtocolFieldWriter::field_struct(int field, const T& val) { put_field_header(field, current_field_value, ST_FLD_STRUCT); writer.write(val); @@ -188,12 +191,14 @@ inline void CompactProtocolFieldWriter::field_struct(int field, const T &val) } template -inline void CompactProtocolFieldWriter::field_struct_list(int field, const std::vector &val) +inline void CompactProtocolFieldWriter::field_struct_list(int field, const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_LIST); put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_STRUCT)); if (val.size() >= 0xf) put_uint(val.size()); - for (auto &v : val) { writer.write(v); } + for (auto& v : val) { + writer.write(v); + } current_field_value = field; } @@ -204,7 +209,7 @@ inline size_t CompactProtocolFieldWriter::value() } inline void CompactProtocolFieldWriter::field_struct_blob(int field, - const std::vector &val) + const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_STRUCT); put_byte(val.data(), (uint32_t)val.size()); @@ -212,32 +217,32 @@ inline void CompactProtocolFieldWriter::field_struct_blob(int field, current_field_value = field; } -inline void CompactProtocolFieldWriter::field_string(int field, const std::string &val) +inline void CompactProtocolFieldWriter::field_string(int field, const std::string& val) { put_field_header(field, current_field_value, ST_FLD_BINARY); put_uint(val.size()); // FIXME : replace reinterpret_cast - put_byte(reinterpret_cast(val.data()), (uint32_t)val.size()); + put_byte(reinterpret_cast(val.data()), (uint32_t)val.size()); current_field_value = field; } inline void CompactProtocolFieldWriter::field_string_list(int field, - const std::vector &val) + const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_LIST); put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_BINARY)); if (val.size() >= 0xf) put_uint(val.size()); - for (auto &v : val) { + for (auto& v : val) { put_uint(v.size()); // FIXME : replace reinterpret_cast - put_byte(reinterpret_cast(v.data()), (uint32_t)v.size()); + put_byte(reinterpret_cast(v.data()), (uint32_t)v.size()); } current_field_value = field; } inline int CompactProtocolFieldWriter::current_field() { return current_field_value; } -inline void CompactProtocolFieldWriter::set_current_field(const int &field) +inline void CompactProtocolFieldWriter::set_current_field(const int& field) { current_field_value = field; } diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 2ce9245490e..633bbdf1e19 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -36,34 +36,34 @@ namespace parquet { */ class CompactProtocolWriter { public: - CompactProtocolWriter(std::vector *output) : m_buf(*output) {} + CompactProtocolWriter(std::vector* output) : m_buf(*output) {} - size_t write(const FileMetaData &); - size_t write(const SchemaElement &); - size_t write(const RowGroup &); - size_t write(const KeyValue &); - size_t write(const ColumnChunk &); - size_t write(const ColumnChunkMetaData &); + size_t write(const FileMetaData&); + size_t write(const SchemaElement&); + size_t write(const RowGroup&); + size_t write(const KeyValue&); + size_t write(const ColumnChunk&); + size_t write(const ColumnChunkMetaData&); protected: - std::vector &m_buf; + std::vector& m_buf; friend class CompactProtocolFieldWriter; }; class CompactProtocolFieldWriter { - CompactProtocolWriter &writer; + 
CompactProtocolWriter& writer; size_t struct_start_pos; int current_field_value; public: - CompactProtocolFieldWriter(CompactProtocolWriter &caller) + CompactProtocolFieldWriter(CompactProtocolWriter& caller) : writer(caller), struct_start_pos(writer.m_buf.size()), current_field_value(0) { } void put_byte(uint8_t v); - void put_byte(const uint8_t *raw, uint32_t len); + void put_byte(const uint8_t* raw, uint32_t len); uint32_t put_uint(uint64_t v); @@ -76,25 +76,25 @@ class CompactProtocolFieldWriter { inline void field_int(int field, int64_t val); template - inline void field_int_list(int field, const std::vector &val); + inline void field_int_list(int field, const std::vector& val); template - inline void field_struct(int field, const T &val); + inline void field_struct(int field, const T& val); template - inline void field_struct_list(int field, const std::vector &val); + inline void field_struct_list(int field, const std::vector& val); inline size_t value(); - inline void field_struct_blob(int field, const std::vector &val); + inline void field_struct_blob(int field, const std::vector& val); - inline void field_string(int field, const std::string &val); + inline void field_string(int field, const std::string& val); - inline void field_string_list(int field, const std::vector &val); + inline void field_string_list(int field, const std::vector& val); inline int current_field(); - inline void set_current_field(const int &field); + inline void set_current_field(const int& field); }; } // namespace parquet diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index dfd9c1384c5..f8158eaa6e9 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -45,10 +45,10 @@ namespace parquet { namespace gpu { struct page_state_s { - const uint8_t *data_start; - const uint8_t *data_end; - const uint8_t *lvl_end; - const uint8_t *dict_base; // ptr to dictionary page data + const uint8_t* data_start; + const uint8_t* data_end; + const uint8_t* lvl_end; + const uint8_t* dict_base; // ptr to dictionary page data int32_t dict_size; // size of dictionary data int32_t first_row; // First row in page to output int32_t num_rows; // Rows in page to decode (including rows to be skipped) @@ -80,7 +80,7 @@ struct page_state_s { int32_t input_leaf_count; // how many leaf values of the input we've processed uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values - const uint8_t *lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + const uint8_t* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded int32_t row_index_lower_bound; // lower bound of row indices we should process }; @@ -100,9 +100,9 @@ struct page_state_s { * * @return The hash value */ -__device__ uint32_t device_str2hash32(const char *key, size_t len, uint32_t seed = 33) +__device__ uint32_t device_str2hash32(const char* key, size_t len, uint32_t seed = 33) { - const uint8_t *p = reinterpret_cast(key); + const uint8_t* p = reinterpret_cast(key); uint32_t h1 = seed, k1; const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; @@ -149,7 +149,7 @@ __device__ uint32_t device_str2hash32(const char *key, size_t len, uint32_t seed * * @return The 32-bit value read */ -inline __device__ uint32_t get_vlq32(const uint8_t *&cur, const uint8_t *end) +inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) { uint32_t v 
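The `c1`/`c2` constants in `device_str2hash32` above are the MurmurHash3 32-bit mixing constants. A host-side sketch of that style of hash follows; it assumes the kernel tracks the public MurmurHash3_x86_32 layout, though the tail handling here follows the reference algorithm and need not match the device code byte-for-byte:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Host-side sketch of a MurmurHash3_x86_32-style string hash. The c1/c2
// constants match device_str2hash32; tail and finalizer details follow the
// public MurmurHash3 reference, not necessarily the kernel exactly.
uint32_t str2hash32(char const* key, size_t len, uint32_t seed = 33)
{
  uint32_t const c1 = 0xcc9e2d51u;
  uint32_t const c2 = 0x1b873593u;
  uint32_t h1       = seed;

  for (size_t i = 0; i + 4 <= len; i += 4) {  // body: 4 bytes per round
    uint32_t k1;
    std::memcpy(&k1, key + i, 4);
    k1 *= c1; k1 = (k1 << 15) | (k1 >> 17); k1 *= c2;
    h1 ^= k1; h1 = (h1 << 13) | (h1 >> 19); h1 = h1 * 5 + 0xe6546b64u;
  }

  uint32_t k1 = 0;  // tail: fold in the remaining 0-3 bytes
  for (size_t i = len & ~size_t{3}; i < len; ++i) k1 = (k1 << 8) | (uint8_t)key[i];
  k1 *= c1; k1 = (k1 << 15) | (k1 >> 17); k1 *= c2; h1 ^= k1;

  h1 ^= (uint32_t)len;  // fmix32 finalizer
  h1 ^= h1 >> 16; h1 *= 0x85ebca6bu;
  h1 ^= h1 >> 13; h1 *= 0xc2b2ae35u;
  h1 ^= h1 >> 16;
  return h1;
}

int main() { std::printf("%08x\n", str2hash32("parquet", 7)); }
```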
= *cur++; if (v >= 0x80 && cur < end) { @@ -178,9 +178,9 @@ inline __device__ uint32_t get_vlq32(const uint8_t *&cur, const uint8_t *end) * * @return The length of the section */ -__device__ uint32_t InitLevelSection(page_state_s *s, - const uint8_t *cur, - const uint8_t *end, +__device__ uint32_t InitLevelSection(page_state_s* s, + const uint8_t* cur, + const uint8_t* end, level_type lvl) { int32_t len; @@ -236,10 +236,10 @@ __device__ uint32_t InitLevelSection(page_state_s *s, * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ __device__ void gpuDecodeStream( - uint32_t *output, page_state_s *s, int32_t target_count, int t, level_type lvl) + uint32_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { - const uint8_t *cur_def = s->lvl_start[lvl]; - const uint8_t *end = s->lvl_end; + const uint8_t* cur_def = s->lvl_start[lvl]; + const uint8_t* end = s->lvl_end; uint32_t level_run = s->initial_rle_run[lvl]; int32_t level_val = s->initial_rle_value[lvl]; int level_bits = s->col.level_bits[lvl]; @@ -253,7 +253,7 @@ __device__ void gpuDecodeStream( // Get a new run symbol from the byte stream int sym_len = 0; if (!t) { - const uint8_t *cur = cur_def; + const uint8_t* cur = cur_def; if (cur < end) { level_run = get_vlq32(cur, end); } if (!(level_run & 1)) { if (cur < end) level_val = cur[0]; @@ -282,7 +282,7 @@ __device__ void gpuDecodeStream( batch_len8 = (batch_len + 7) >> 3; if (t < batch_len) { int bitpos = t * level_bits; - const uint8_t *cur = cur_def + (bitpos >> 3); + const uint8_t* cur = cur_def + (bitpos >> 3); bitpos &= 7; if (cur < end) level_val = cur[0]; cur++; @@ -327,9 +327,9 @@ __device__ void gpuDecodeStream( * * @return The new output position */ -__device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_pos, int t) +__device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_pos, int t) { - const uint8_t *end = s->data_end; + const uint8_t* end = s->data_end; int dict_bits = s->dict_bits; int pos = s->dict_pos; @@ -337,7 +337,7 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_p int is_literal, batch_len; if (!t) { uint32_t run = s->dict_run; - const uint8_t *cur = s->data_start; + const uint8_t* cur = s->data_start; if (run <= 1) { run = (cur < end) ? get_vlq32(cur, end) : 0; if (!(run & 1)) { @@ -380,7 +380,7 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_p int dict_idx = s->dict_val; if (is_literal) { int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; - const uint8_t *p = s->data_start + (ofs >> 3); + const uint8_t* p = s->data_start + (ofs >> 3); ofs &= 7; if (p < end) { uint32_t c = 8 - ofs; @@ -413,16 +413,16 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_p * * @return The new output position */ -__device__ int gpuDecodeRleBooleans(volatile page_state_s *s, int target_pos, int t) +__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int target_pos, int t) { - const uint8_t *end = s->data_end; + const uint8_t* end = s->data_end; int pos = s->dict_pos; while (pos < target_pos) { int is_literal, batch_len; if (!t) { uint32_t run = s->dict_run; - const uint8_t *cur = s->data_start; + const uint8_t* cur = s->data_start; if (run <= 1) { run = (cur < end) ? 
get_vlq32(cur, end) : 0; if (!(run & 1)) { @@ -455,7 +455,7 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s *s, int target_pos, in int dict_idx; if (is_literal) { int32_t ofs = t - ((batch_len + 7) & ~7); - const uint8_t *p = s->data_start + (ofs >> 3); + const uint8_t* p = s->data_start + (ofs >> 3); dict_idx = (p < end) ? (p[0] >> (ofs & 7u)) & 1 : 0; } else { dict_idx = s->dict_val; @@ -476,12 +476,12 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s *s, int target_pos, in * * @return The new output position */ -__device__ void gpuInitStringDescriptors(volatile page_state_s *s, int target_pos, int t) +__device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_pos, int t) { int pos = s->dict_pos; // This step is purely serial if (!t) { - const uint8_t *cur = s->data_start; + const uint8_t* cur = s->data_start; int dict_size = s->dict_size; int k = s->dict_val; @@ -511,9 +511,9 @@ __device__ void gpuInitStringDescriptors(volatile page_state_s *s, int target_po * @param[in] src_pos Source position * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ -inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, void *dstv) +inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) { - const char *ptr = NULL; + const char* ptr = NULL; size_t len = 0; if (s->dict_base) { @@ -522,8 +522,8 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo sizeof(string_index_pair) : 0; if (dict_pos < (uint32_t)s->dict_size) { - const string_index_pair *src = - reinterpret_cast(s->dict_base + dict_pos); + const string_index_pair* src = + reinterpret_cast(s->dict_base + dict_pos); ptr = src->first; len = src->second; } @@ -531,16 +531,16 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo // Plain encoding uint32_t dict_pos = s->dict_idx[src_pos & (non_zero_buffer_size - 1)]; if (dict_pos <= (uint32_t)s->dict_size) { - ptr = reinterpret_cast(s->data_start + dict_pos); + ptr = reinterpret_cast(s->data_start + dict_pos); len = s->str_len[src_pos & (non_zero_buffer_size - 1)]; } } if (s->dtype_len == 4) { // Output hash - *static_cast(dstv) = device_str2hash32(ptr, len); + *static_cast(dstv) = device_str2hash32(ptr, len); } else { // Output string descriptor - string_index_pair *dst = static_cast(dstv); + string_index_pair* dst = static_cast(dstv); dst->first = ptr; dst->second = len; } @@ -553,7 +553,7 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputBoolean(volatile page_state_s *s, int src_pos, uint8_t *dst) +inline __device__ void gpuOutputBoolean(volatile page_state_s* s, int src_pos, uint8_t* dst) { *dst = s->dict_idx[src_pos & (non_zero_buffer_size - 1)]; } @@ -566,8 +566,8 @@ inline __device__ void gpuOutputBoolean(volatile page_state_s *s, int src_pos, u * @param[in] dict_pos byte position in dictionary * @param[in] dict_size size of dictionary */ -inline __device__ void gpuStoreOutput(uint32_t *dst, - const uint8_t *src8, +inline __device__ void gpuStoreOutput(uint32_t* dst, + const uint8_t* src8, uint32_t dict_pos, uint32_t dict_size) { @@ -576,9 +576,9 @@ inline __device__ void gpuStoreOutput(uint32_t *dst, src8 -= ofs; // align to 32-bit boundary ofs <<= 3; // bytes -> bits if (dict_pos < dict_size) { - bytebuf = *reinterpret_cast(src8 + dict_pos); + bytebuf = 
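`gpuInitStringDescriptors` above walks Parquet's PLAIN-encoded BYTE_ARRAY layout: each value is a 4-byte little-endian length followed by that many bytes, and the walk is inherently serial, which is why only thread 0 performs it. A host sketch of the same walk, producing the `(pointer, length)` descriptors the kernel records:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>
#include <vector>

// Sketch of the PLAIN BYTE_ARRAY walk in gpuInitStringDescriptors:
// every value is <uint32 LE length><length bytes>. The walk is serial
// because each length must be read before the next value can be found.
std::vector<std::pair<char const*, uint32_t>> init_string_descriptors(
  uint8_t const* data, size_t size, int num_values)
{
  std::vector<std::pair<char const*, uint32_t>> out;
  size_t pos = 0;
  for (int i = 0; i < num_values && pos + 4 <= size; ++i) {
    uint32_t len;
    std::memcpy(&len, data + pos, 4);  // assumes a little-endian host
    pos += 4;
    if (pos + len > size) break;       // truncated page: stop early
    out.emplace_back(reinterpret_cast<char const*>(data + pos), len);
    pos += len;
  }
  return out;
}

int main()
{
  uint8_t const page[] = {3, 0, 0, 0, 'f', 'o', 'o', 2, 0, 0, 0, 'h', 'i'};
  for (auto [p, n] : init_string_descriptors(page, sizeof(page), 2))
    std::printf("%.*s\n", (int)n, p);
}
```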
*reinterpret_cast(src8 + dict_pos); if (ofs) { - uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); + uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); bytebuf = __funnelshift_r(bytebuf, bytebufnext, ofs); } } else { @@ -595,8 +595,8 @@ inline __device__ void gpuStoreOutput(uint32_t *dst, * @param[in] dict_pos byte position in dictionary * @param[in] dict_size size of dictionary */ -inline __device__ void gpuStoreOutput(uint2 *dst, - const uint8_t *src8, +inline __device__ void gpuStoreOutput(uint2* dst, + const uint8_t* src8, uint32_t dict_pos, uint32_t dict_size) { @@ -605,10 +605,10 @@ inline __device__ void gpuStoreOutput(uint2 *dst, src8 -= ofs; // align to 32-bit boundary ofs <<= 3; // bytes -> bits if (dict_pos < dict_size) { - v.x = *reinterpret_cast(src8 + dict_pos + 0); - v.y = *reinterpret_cast(src8 + dict_pos + 4); + v.x = *reinterpret_cast(src8 + dict_pos + 0); + v.y = *reinterpret_cast(src8 + dict_pos + 4); if (ofs) { - uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); + uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); v.x = __funnelshift_r(v.x, v.y, ofs); v.y = __funnelshift_r(v.y, next, ofs); } @@ -625,9 +625,9 @@ inline __device__ void gpuStoreOutput(uint2 *dst, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s *s, int src_pos, int64_t *dst) +inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) { - const uint8_t *src8; + const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; int64_t ts; @@ -647,11 +647,11 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s *s, int src if (dict_pos + 4 < dict_size) { uint3 v; int64_t nanos, secs, days; - v.x = *reinterpret_cast(src8 + dict_pos + 0); - v.y = *reinterpret_cast(src8 + dict_pos + 4); - v.z = *reinterpret_cast(src8 + dict_pos + 8); + v.x = *reinterpret_cast(src8 + dict_pos + 0); + v.y = *reinterpret_cast(src8 + dict_pos + 4); + v.z = *reinterpret_cast(src8 + dict_pos + 8); if (ofs) { - uint32_t next = *reinterpret_cast(src8 + dict_pos + 12); + uint32_t next = *reinterpret_cast(src8 + dict_pos + 12); v.x = __funnelshift_r(v.x, v.y, ofs); v.y = __funnelshift_r(v.y, v.z, ofs); v.z = __funnelshift_r(v.z, next, ofs); @@ -681,9 +681,9 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s *s, int src * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s *s, int src_pos, int64_t *dst) +inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) { - const uint8_t *src8; + const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; int64_t ts; @@ -704,10 +704,10 @@ inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s *s, int src uint2 v; int64_t val; int32_t ts_scale; - v.x = *reinterpret_cast(src8 + dict_pos + 0); - v.y = *reinterpret_cast(src8 + dict_pos + 4); + v.x = *reinterpret_cast(src8 + dict_pos + 0); + v.y = *reinterpret_cast(src8 + dict_pos + 4); if (ofs) { - uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); + uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); v.x = __funnelshift_r(v.x, v.y, ofs); v.y = __funnelshift_r(v.y, next, ofs); } @@ -746,12 +746,12 @@ static const __device__ __constant__ double kPow10[40] = { * @param[in] dst Pointer to row output data * @param[in] dtype Stored data type */ -inline 
__device__ void gpuOutputDecimalAsFloat(volatile page_state_s *s, +inline __device__ void gpuOutputDecimalAsFloat(volatile page_state_s* s, int src_pos, - double *dst, + double* dst, int dtype) { - const uint8_t *dict; + const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size, dtype_len_in; int64_t i128_hi, i128_lo; int32_t scale; @@ -823,12 +823,12 @@ inline __device__ void gpuOutputDecimalAsFloat(volatile page_state_s *s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputFixedLenByteArrayAsInt64(volatile page_state_s *s, +inline __device__ void gpuOutputFixedLenByteArrayAsInt64(volatile page_state_s* s, int src_pos, - int64_t *dst) + int64_t* dst) { uint32_t const dtype_len_in = s->dtype_len_in; - uint8_t const *data = s->dict_base ? s->dict_base : s->data_start; + uint8_t const* data = s->dict_base ? s->dict_base : s->data_start; uint32_t const pos = (s->dict_base ? ((s->dict_bits > 0) ? s->dict_idx[src_pos & (non_zero_buffer_size - 1)] : 0) : src_pos) * @@ -857,9 +857,9 @@ inline __device__ void gpuOutputFixedLenByteArrayAsInt64(volatile page_state_s * * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputFast(volatile page_state_s *s, int src_pos, T *dst) +inline __device__ void gpuOutputFast(volatile page_state_s* s, int src_pos, T* dst) { - const uint8_t *dict; + const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { @@ -883,12 +883,12 @@ inline __device__ void gpuOutputFast(volatile page_state_s *s, int src_pos, T *d * @param[in] dst8 Pointer to row output data * @param[in] len Length of element */ -static __device__ void gpuOutputGeneric(volatile page_state_s *s, +static __device__ void gpuOutputGeneric(volatile page_state_s* s, int src_pos, - uint8_t *dst8, + uint8_t* dst8, int len) { - const uint8_t *dict; + const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { @@ -908,23 +908,23 @@ static __device__ void gpuOutputGeneric(volatile page_state_s *s, } } else { // Copy 4 bytes at a time - const uint8_t *src8 = dict; + const uint8_t* src8 = dict; unsigned int ofs = 3 & reinterpret_cast(src8); src8 -= ofs; // align to 32-bit boundary ofs <<= 3; // bytes -> bits for (unsigned int i = 0; i < len; i += 4) { uint32_t bytebuf; if (dict_pos < dict_size) { - bytebuf = *reinterpret_cast(src8 + dict_pos); + bytebuf = *reinterpret_cast(src8 + dict_pos); if (ofs) { - uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); + uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); bytebuf = __funnelshift_r(bytebuf, bytebufnext, ofs); } } else { bytebuf = 0; } dict_pos += 4; - *reinterpret_cast(dst8 + i) = bytebuf; + *reinterpret_cast(dst8 + i) = bytebuf; } } } @@ -939,9 +939,9 @@ static __device__ void gpuOutputGeneric(volatile page_state_s *s, * @param[in] min_row crop all rows below min_row * @param[in] num_chunk Number of column chunks */ -static __device__ bool setupLocalPageInfo(page_state_s *const s, - PageInfo *p, - ColumnChunkDesc const *chunks, +static __device__ bool setupLocalPageInfo(page_state_s* const s, + PageInfo* p, + ColumnChunkDesc const* chunks, size_t min_row, size_t num_rows, int32_t num_chunks) @@ -984,8 +984,8 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, // - On page N, the remaining 4/6 values are encoded, but there are no new rows. 
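`gpuStoreOutput` and `gpuOutputGeneric` above avoid unaligned device loads by rounding the source pointer down to a 4-byte boundary, reading whole 32-bit words, and recombining adjacent words with `__funnelshift_r`. A portable host sketch of the same trick (with `memcpy` standing in for the aligned load, and noting that the real kernels guard the extra word read with `dict_size`):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Portable stand-in for CUDA's __funnelshift_r(lo, hi, s): shift the 64-bit
// value {hi:lo} right by s bits (0..31) and return the low 32 bits.
static uint32_t funnelshift_r(uint32_t lo, uint32_t hi, uint32_t s)
{
  return s ? (lo >> s) | (hi << (32 - s)) : lo;
}

// The gpuStoreOutput/gpuOutputGeneric trick: align the pointer down to a
// 4-byte boundary, do only aligned 32-bit loads, then funnel-shift the two
// neighboring words to reassemble the unaligned value.
static uint32_t load_unaligned_u32(uint8_t const* src)
{
  uint32_t ofs        = (uint32_t)(reinterpret_cast<uintptr_t>(src) & 3);
  uint8_t const* base = src - ofs;  // aligned base
  uint32_t lo, hi = 0;
  std::memcpy(&lo, base, 4);        // memcpy stands in for the aligned load
  if (ofs) std::memcpy(&hi, base + 4, 4);  // kernels bound this read by dict_size
  return funnelshift_r(lo, hi, ofs * 8);   // bytes -> bits
}

int main()
{
  alignas(4) uint8_t buf[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
  std::printf("%08x\n", load_unaligned_u32(buf + 1));  // 0x55443322 on LE hosts
}
```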
// if (s->page.num_input_values > 0 && s->page.num_rows > 0) { if (s->page.num_input_values > 0) { - uint8_t *cur = s->page.page_data; - uint8_t *end = cur + s->page.uncompressed_page_size; + uint8_t* cur = s->page.page_data; + uint8_t* end = cur + s->page.uncompressed_page_size; uint32_t dtype_len_out = s->col.data_type >> 3; s->ts_scale = 0; @@ -1052,7 +1052,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, if (s->col.column_data_base != nullptr) { int max_depth = s->col.max_nesting_depth; for (int idx = 0; idx < max_depth; idx++) { - PageNestingInfo *pni = &s->page.nesting[idx]; + PageNestingInfo* pni = &s->page.nesting[idx]; size_t output_offset; // schemas without lists @@ -1064,7 +1064,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, output_offset = pni->page_start_value; } - pni->data_out = static_cast(s->col.column_data_base[idx]); + pni->data_out = static_cast(s->col.column_data_base[idx]); if (pni->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. @@ -1094,7 +1094,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, // RLE-packed dictionary indices, first byte indicates index length in bits if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { // String dictionary: use index - s->dict_base = reinterpret_cast(s->col.str_dict_index); + s->dict_base = reinterpret_cast(s->col.str_dict_index); s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); } else { s->dict_base = @@ -1195,7 +1195,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, * @param[in] valid_mask The validity mask to be stored * @param[in] value_count # of bits in the validity mask */ -static __device__ void store_validity(PageNestingInfo *pni, +static __device__ void store_validity(PageNestingInfo* pni, uint32_t valid_mask, int32_t value_count) { @@ -1249,10 +1249,10 @@ static __device__ void store_validity(PageNestingInfo *pni, * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index */ -inline __device__ void get_nesting_bounds(int &start_depth, - int &end_depth, - int &d, - page_state_s *s, +inline __device__ void get_nesting_bounds(int& start_depth, + int& end_depth, + int& d, + page_state_s* s, int input_value_count, int32_t target_input_value_count, int t) @@ -1288,7 +1288,7 @@ inline __device__ void get_nesting_bounds(int &start_depth, * @param[in] t Thread index */ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, - page_state_s *s, + page_state_s* s, int t) { // max nesting depth of the column @@ -1339,7 +1339,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // walk from 0 to max_depth uint32_t next_thread_value_count, next_warp_value_count; for (int s_idx = 0; s_idx < max_depth; s_idx++) { - PageNestingInfo *pni = &s->page.nesting[s_idx]; + PageNestingInfo* pni = &s->page.nesting[s_idx]; // if we are within the range of nesting levels we should be adding value indices for int const in_nesting_bounds = @@ -1391,7 +1391,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu cudf::size_type const ofs = s->page.nesting[s_idx + 1].value_count + next_thread_value_count + s->page.nesting[s_idx + 1].page_start_value; - (reinterpret_cast(pni->data_out))[idx] = ofs; + (reinterpret_cast(pni->data_out))[idx] = ofs; } } @@ -1455,7 
+1455,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for * @param[in] t Thread index */ -__device__ void gpuDecodeLevels(page_state_s *s, int32_t target_leaf_count, int t) +__device__ void gpuDecodeLevels(page_state_s* s, int32_t target_leaf_count, int t) { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -1467,7 +1467,7 @@ __device__ void gpuDecodeLevels(page_state_s *s, int32_t target_leaf_count, int gpuDecodeStream(s->def, s, cur_leaf_count, t, level_type::DEFINITION); __syncwarp(); - // because the rep and def streams are encoded seperately, we cannot request an exact + // because the rep and def streams are encoded separately, we cannot request an exact // # of values to be decoded at once. we can only process the lowest # of decoded rep/def // levels we get. int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], @@ -1494,7 +1494,7 @@ __device__ void gpuDecodeLevels(page_state_s *s, int32_t target_leaf_count, int * @param[in] bounds_set Whether or not s->row_index_lower_bound, s->first_row and s->num_rows * have been computed for this page (they will only be set in the second/trim pass). */ -static __device__ void gpuUpdatePageSizes(page_state_s *s, +static __device__ void gpuUpdatePageSizes(page_state_s* s, int32_t target_input_value_count, int t, bool bounds_set) @@ -1586,8 +1586,8 @@ static __device__ void gpuUpdatePageSizes(page_state_s *s, */ // blockDim {block_size,1,1} extern "C" __global__ void __launch_bounds__(block_size) - gpuComputePageSizes(PageInfo *pages, - ColumnChunkDesc const *chunks, + gpuComputePageSizes(PageInfo* pages, + ColumnChunkDesc const* chunks, size_t min_row, size_t num_rows, int32_t num_chunks, @@ -1595,10 +1595,10 @@ extern "C" __global__ void __launch_bounds__(block_size) { __shared__ __align__(16) page_state_s state_g; - page_state_s *const s = &state_g; + page_state_s* const s = &state_g; int page_idx = blockIdx.x; int t = threadIdx.x; - PageInfo *pp = &pages[page_idx]; + PageInfo* pp = &pages[page_idx]; if (!setupLocalPageInfo( s, pp, chunks, trim_pass ? min_row : 0, trim_pass ? 
num_rows : INT_MAX, num_chunks)) { @@ -1678,15 +1678,15 @@ extern "C" __global__ void __launch_bounds__(block_size) */ // blockDim {block_size,1,1} extern "C" __global__ void __launch_bounds__(block_size) - gpuDecodePageData(PageInfo *pages, - ColumnChunkDesc const *chunks, + gpuDecodePageData(PageInfo* pages, + ColumnChunkDesc const* chunks, size_t min_row, size_t num_rows, int32_t num_chunks) { __shared__ __align__(16) page_state_s state_g; - page_state_s *const s = &state_g; + page_state_s* const s = &state_g; int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; @@ -1732,7 +1732,7 @@ extern "C" __global__ void __launch_bounds__(block_size) } else if ((s->col.data_type & 7) == BYTE_ARRAY) { gpuInitStringDescriptors(s, src_target_pos, t & 0x1f); } - if (t == 32) { *(volatile int32_t *)&s->dict_pos = src_target_pos; } + if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int dtype = s->col.data_type & 7; @@ -1767,52 +1767,52 @@ extern "C" __global__ void __launch_bounds__(block_size) int leaf_level_index = s->col.max_nesting_depth - 1; uint32_t dtype_len = s->dtype_len; - void *dst = + void* dst = s->page.nesting[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; if (dtype == BYTE_ARRAY) { gpuOutputString(s, val_src_pos, dst); } else if (dtype == BOOLEAN) { - gpuOutputBoolean(s, val_src_pos, static_cast(dst)); + gpuOutputBoolean(s, val_src_pos, static_cast(dst)); } else if (s->col.converted_type == DECIMAL) { switch (dtype) { - case INT32: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; - case INT64: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; + case INT32: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; + case INT64: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; default: // we currently do not support reading byte arrays larger than DECIMAL64 if (s->dtype_len_in <= 8) { - gpuOutputFixedLenByteArrayAsInt64(s, val_src_pos, static_cast(dst)); + gpuOutputFixedLenByteArrayAsInt64(s, val_src_pos, static_cast(dst)); } else { - gpuOutputDecimalAsFloat(s, val_src_pos, static_cast(dst), dtype); + gpuOutputDecimalAsFloat(s, val_src_pos, static_cast(dst), dtype); } break; } } else if (dtype == INT96) { - gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); + gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); } else if (dtype_len == 8) { if (s->ts_scale) { - gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); + gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); } else { - gpuOutputFast(s, val_src_pos, static_cast(dst)); + gpuOutputFast(s, val_src_pos, static_cast(dst)); } } else if (dtype_len == 4) { - gpuOutputFast(s, val_src_pos, static_cast(dst)); + gpuOutputFast(s, val_src_pos, static_cast(dst)); } else { - gpuOutputGeneric(s, val_src_pos, static_cast(dst), dtype_len); + gpuOutputGeneric(s, val_src_pos, static_cast(dst), dtype_len); } } - if (t == out_thread0) { *(volatile int32_t *)&s->src_pos = target_pos; } + if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } } __syncthreads(); } } struct chunk_row_output_iter { - PageInfo *p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; - using pointer = size_type *; - using reference = size_type &; + using pointer = size_type*; + using reference = size_type&; using iterator_category = thrust::output_device_iterator_tag; __host__ __device__ chunk_row_output_iter operator+(int i) @@ -1828,16 +1828,16 @@ struct chunk_row_output_iter { }; struct 
start_offset_output_iterator { - PageInfo *pages; - int *page_indices; + PageInfo* pages; + int* page_indices; int cur_index; int src_col_schema; int nesting_depth; int empty = 0; using value_type = size_type; using difference_type = size_type; - using pointer = size_type *; - using reference = size_type &; + using pointer = size_type*; + using reference = size_type&; using iterator_category = thrust::output_device_iterator_tag; __host__ __device__ start_offset_output_iterator operator+(int i) @@ -1854,7 +1854,7 @@ struct start_offset_output_iterator { private: __device__ reference dereference(int index) { - PageInfo const &p = pages[page_indices[index]]; + PageInfo const& p = pages[page_indices[index]]; if (p.src_col_schema != src_col_schema || p.flags & PAGEINFO_FLAGS_DICTIONARY) { return empty; } return p.nesting[nesting_depth].page_start_value; } @@ -1863,14 +1863,14 @@ struct start_offset_output_iterator { /** * @copydoc cudf::io::parquet::gpu::PreprocessColumnData */ -void PreprocessColumnData(hostdevice_vector &pages, - hostdevice_vector const &chunks, - std::vector &input_columns, - std::vector &output_columns, +void PreprocessColumnData(hostdevice_vector& pages, + hostdevice_vector const& chunks, + std::vector& input_columns, + std::vector& output_columns, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { dim3 dim_block(block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page @@ -1885,9 +1885,9 @@ void PreprocessColumnData(hostdevice_vector &pages, // computes: // PageInfo::chunk_row for all pages auto key_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const &page) { return page.chunk_idx; }); + pages.device_ptr(), [] __device__(PageInfo const& page) { return page.chunk_idx; }); auto page_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const &page) { return page.num_rows; }); + pages.device_ptr(), [] __device__(PageInfo const& page) { return page.num_rows; }); thrust::exclusive_scan_by_key(rmm::exec_policy(stream), key_input, key_input + pages.size(), @@ -1927,7 +1927,7 @@ void PreprocessColumnData(hostdevice_vector &pages, pages.device_ptr(), pages.device_ptr() + pages.size(), page_keys.begin(), - [] __device__(PageInfo const &page) { return page.src_col_schema; }); + [] __device__(PageInfo const& page) { return page.src_col_schema; }); thrust::sequence(rmm::exec_policy(stream), page_index.begin(), page_index.end()); thrust::stable_sort_by_key(rmm::exec_policy(stream), @@ -1939,20 +1939,20 @@ void PreprocessColumnData(hostdevice_vector &pages, // compute output column sizes by examining the pages of the -input- columns for (size_t idx = 0; idx < input_columns.size(); idx++) { - auto const &input_col = input_columns[idx]; + auto const& input_col = input_columns[idx]; auto src_col_schema = input_col.schema_idx; size_t max_depth = input_col.nesting_depth(); - auto *cols = &output_columns; + auto* cols = &output_columns; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto &out_buf = (*cols)[input_col.nesting[l_idx]]; + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; // size iterator. 
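The `exclusive_scan_by_key` call in `PreprocessColumnData` above computes `PageInfo::chunk_row`: pages are keyed by `chunk_idx`, values are `num_rows`, and the exclusive scan gives each page the number of rows that precede it within its own chunk. A serial host sketch of the same computation (with a hypothetical `PageLite` in place of `PageInfo`):

```cpp
#include <cstdio>
#include <vector>

struct PageLite {     // hypothetical slimmed-down PageInfo
  int chunk_idx;
  int num_rows;
  int chunk_row = 0;  // output: first row of this page within its chunk
};

// Host sketch of thrust::exclusive_scan_by_key as used above: the running
// sum resets whenever the key (chunk_idx) changes, and each page receives
// the sum of num_rows of the pages before it in the same chunk.
void compute_chunk_rows(std::vector<PageLite>& pages)
{
  int running = 0;
  for (size_t i = 0; i < pages.size(); ++i) {
    if (i == 0 || pages[i].chunk_idx != pages[i - 1].chunk_idx) running = 0;
    pages[i].chunk_row = running;  // exclusive: value before adding self
    running += pages[i].num_rows;
  }
}

int main()
{
  std::vector<PageLite> pages{{0, 100}, {0, 50}, {1, 80}, {1, 20}};
  compute_chunk_rows(pages);
  for (auto const& p : pages)
    std::printf("chunk %d starts at row %d\n", p.chunk_idx, p.chunk_row);
  // chunk 0: 0, 100; chunk 1: 0, 80
}
```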
indexes pages by sorted order auto size_input = thrust::make_transform_iterator( page_index.begin(), [src_col_schema, l_idx, pages = pages.device_ptr()] __device__(int index) { - auto const &page = pages[index]; + auto const& page = pages[index]; if (page.src_col_schema != src_col_schema || page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -1989,8 +1989,8 @@ void PreprocessColumnData(hostdevice_vector &pages, /** * @copydoc cudf::io::parquet::gpu::DecodePageData */ -void __host__ DecodePageData(hostdevice_vector &pages, - hostdevice_vector const &chunks, +void __host__ DecodePageData(hostdevice_vector& pages, + hostdevice_vector const& chunks, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index 2d505b99981..0c55828b120 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -29,8 +29,8 @@ namespace parquet { namespace gpu { struct dict_state_s { uint32_t row_cnt; - PageFragment *cur_fragment; - uint32_t *hashmap; + PageFragment* cur_fragment; + uint32_t* hashmap; uint32_t total_dict_entries; //!< Total number of entries in dictionary uint32_t dictionary_size; //!< Total dictionary size in bytes uint32_t num_dict_entries; //!< Dictionary entries in current fragment to add @@ -52,14 +52,14 @@ inline __device__ uint32_t uint64_hash16(uint64_t v) return uint32_hash16((uint32_t)(v + (v >> 32))); } -inline __device__ uint32_t hash_string(const string_view &val) +inline __device__ uint32_t hash_string(const string_view& val) { - const char *p = val.data(); + const char* p = val.data(); uint32_t len = val.size_bytes(); uint32_t hash = len; if (len > 0) { uint32_t align_p = 3 & reinterpret_cast(p); - const uint32_t *p32 = reinterpret_cast(p - align_p); + const uint32_t* p32 = reinterpret_cast(p - align_p); uint32_t ofs = align_p * 8; uint32_t v; while (len > 4) { @@ -85,8 +85,8 @@ inline __device__ uint32_t hash_string(const string_view &val) * @param[in] frag_start_row row position of current fragment * @param[in] t thread id */ -__device__ void FetchDictionaryFragment(dict_state_s *s, - uint32_t *dict_data, +__device__ void FetchDictionaryFragment(dict_state_s* s, + uint32_t* dict_data, uint32_t frag_start_row, uint32_t t) { @@ -108,12 +108,12 @@ __device__ void FetchDictionaryFragment(dict_state_s *s, /// Generate dictionary indices in ascending row order template -__device__ void GenerateDictionaryIndices(dict_state_s *s, uint32_t t) +__device__ void GenerateDictionaryIndices(dict_state_s* s, uint32_t t) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - uint32_t *dict_index = s->col.dict_index; - uint32_t *dict_data = s->col.dict_data + s->ck.start_row; + uint32_t* dict_index = s->col.dict_index; + uint32_t* dict_data = s->col.dict_data + s->ck.start_row; uint32_t num_dict_entries = 0; for (uint32_t i = 0; i < s->row_cnt; i += 1024) { @@ -150,13 +150,13 @@ __device__ void GenerateDictionaryIndices(dict_state_s *s, uint32_t t) // blockDim(1024, 1, 1) template __global__ void __launch_bounds__(block_size, 1) - gpuBuildChunkDictionaries(device_span chunks, uint32_t *dev_scratch) + gpuBuildChunkDictionaries(device_span chunks, uint32_t* dev_scratch) { __shared__ __align__(8) dict_state_s state_g; using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage temp_storage; - dict_state_s *const s = &state_g; + dict_state_s* const s = &state_g; uint32_t t = threadIdx.x; uint32_t dtype, dtype_len, 
dtype_len_in; @@ -227,23 +227,19 @@ __global__ void __launch_bounds__(block_size, 1) val = s->col.leaf_column->element(row); hash = uint64_hash16(val); } else { - val = (dtype_len_in == 4) - ? s->col.leaf_column->element(row) - : (dtype_len_in == 2) ? s->col.leaf_column->element(row) - : s->col.leaf_column->element(row); + val = (dtype_len_in == 4) ? s->col.leaf_column->element(row) + : (dtype_len_in == 2) ? s->col.leaf_column->element(row) + : s->col.leaf_column->element(row); hash = uint32_hash16(val); } // Walk the list of rows with the same hash next_addr = &s->hashmap[hash]; while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { auto const current = next - 1; - uint64_t val2 = (dtype_len_in == 8) - ? s->col.leaf_column->element(current) - : (dtype_len_in == 4) - ? s->col.leaf_column->element(current) - : (dtype_len_in == 2) - ? s->col.leaf_column->element(current) - : s->col.leaf_column->element(current); + uint64_t val2 = (dtype_len_in == 8) ? s->col.leaf_column->element(current) + : (dtype_len_in == 4) ? s->col.leaf_column->element(current) + : (dtype_len_in == 2) ? s->col.leaf_column->element(current) + : s->col.leaf_column->element(current); if (val2 == val) { is_dupe = 1; break; @@ -274,7 +270,9 @@ __global__ void __launch_bounds__(block_size, 1) bool reorder_check = (is_valid && is_dupe && next - 1 > row); if (reorder_check) { next = s->col.dict_index[next - 1]; - while (next & (1u << 31)) { next = s->col.dict_index[next & 0x7fffffff]; } + while (next & (1u << 31)) { + next = s->col.dict_index[next & 0x7fffffff]; + } } if (__syncthreads_or(reorder_check)) { if (reorder_check) { atomicMin(&s->col.dict_index[next], row); } @@ -324,7 +322,7 @@ __global__ void __launch_bounds__(block_size, 1) * @param[in] stream CUDA stream to use, default 0 */ void BuildChunkDictionaries(device_span chunks, - uint32_t *dev_scratch, + uint32_t* dev_scratch, rmm::cuda_stream_view stream) { auto num_chunks = chunks.size(); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index bf9114949aa..3c62dcf7eea 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -59,8 +59,8 @@ struct frag_init_state_s { }; struct page_enc_state_s { - uint8_t *cur; //!< current output ptr - uint8_t *rle_out; //!< current RLE write ptr + uint8_t* cur; //!< current output ptr + uint8_t* rle_out; //!< current RLE write ptr uint32_t rle_run; //!< current RLE run uint32_t run_val; //!< current RLE run value uint32_t rle_pos; //!< RLE encoder positions @@ -81,9 +81,9 @@ struct page_enc_state_s { /** * @brief Return a 12-bit hash from a byte sequence */ -inline __device__ uint32_t hash_string(const string_view &val) +inline __device__ uint32_t hash_string(const string_view& val) { - char const *ptr = val.data(); + char const* ptr = val.data(); uint32_t len = val.size_bytes(); if (len != 0) { return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); @@ -130,7 +130,7 @@ __global__ void __launch_bounds__(block_size) typename block_scan::TempStorage scan_storage; } temp_storage; - frag_init_state_s *const s = &state_g; + frag_init_state_s* const s = &state_g; uint32_t t = threadIdx.x; uint32_t start_row, dtype_len, dtype_len_in, dtype; @@ -190,9 +190,11 @@ __global__ void __launch_bounds__(block_size) s->frag.num_values = s->frag.num_rows; } } - dtype = s->col.physical_type; - dtype_len = - (dtype == INT96) ? 12 : (dtype == INT64 || dtype == DOUBLE) ? 8 : (dtype == BOOLEAN) ? 1 : 4; + dtype = s->col.physical_type; + dtype_len = (dtype == INT96) ? 
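The `gpuBuildChunkDictionaries` loop above deduplicates values through a hash table whose slots store `row + 1` (so `0` means empty), chasing collisions with `atomicCAS` and comparing element values to flag duplicates. A single-threaded analogue, with a simple linear probe standing in for the kernel's atomic chains and warp votes:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Single-threaded analogue of the dictionary dedup pass: a 16-bit-hash
// table stores row + 1 (0 marks an empty slot); a row is a duplicate if a
// previously inserted row holds an equal value. The real kernel links
// colliding rows with atomicCAS and resolves ordering with warp ballots.
struct dict_builder {
  std::vector<uint32_t> slots = std::vector<uint32_t>(1 << 16, 0);

  // Returns true if `row` introduces a new dictionary entry.
  bool insert(std::vector<uint32_t> const& values, uint32_t row)
  {
    uint32_t h = (values[row] * 2654435761u) >> 16;  // any 16-bit hash works
    for (;; h = (h + 1) & 0xffffu) {                 // probe until hit or empty
      uint32_t prev = slots[h];
      if (prev == 0) { slots[h] = row + 1; return true; }  // new entry
      if (values[prev - 1] == values[row]) return false;   // duplicate
    }
  }
};

int main()
{
  std::vector<uint32_t> vals{7, 7, 3, 7, 3, 9};
  dict_builder b;
  int uniques = 0;
  for (uint32_t r = 0; r < vals.size(); ++r) uniques += b.insert(vals, r);
  std::printf("dictionary entries: %d\n", uniques);  // 3
}
```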
12 + : (dtype == INT64 || dtype == DOUBLE) ? 8 + : (dtype == BOOLEAN) ? 1 + : 4; if (dtype == INT32) { dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); } else if (dtype == INT96) { @@ -224,11 +226,10 @@ __global__ void __launch_bounds__(block_size) } else if (dtype_len_in == 8) { hash = uint64_init_hash(s->col.leaf_column->element(val_idx)); } else { - hash = uint32_init_hash((dtype_len_in == 4) - ? s->col.leaf_column->element(val_idx) - : (dtype_len_in == 2) - ? s->col.leaf_column->element(val_idx) - : s->col.leaf_column->element(val_idx)); + hash = + uint32_init_hash((dtype_len_in == 4) ? s->col.leaf_column->element(val_idx) + : (dtype_len_in == 2) ? s->col.leaf_column->element(val_idx) + : s->col.leaf_column->element(val_idx)); } } } else { @@ -246,7 +247,7 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); if (is_valid && dtype != BOOLEAN) { - uint32_t *dict_index = s->col.dict_index; + uint32_t* dict_index = s->col.dict_index; if (dict_index) { atomicAdd(&s->map.u32[hash >> 1], (hash & 1) ? 1 << 16 : 1); dict_index[start_value_idx + nz_pos] = @@ -283,7 +284,7 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); // Put the indices back in hash order if (s->col.dict_index) { - uint32_t *dict_index = s->col.dict_index + start_row; + uint32_t* dict_index = s->col.dict_index + start_row; uint32_t nnz = s->frag.non_nulls; for (uint32_t i = 0; i < nnz; i += block_size) { uint32_t pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row, val = 0; @@ -393,7 +394,7 @@ __global__ void __launch_bounds__(128) uint32_t frag_id = blockIdx.y * 4 + (threadIdx.x >> 5); uint32_t column_id = blockIdx.x; auto num_fragments_per_column = fragments.size().second; - statistics_group *const g = &group_g[threadIdx.x >> 5]; + statistics_group* const g = &group_g[threadIdx.x >> 5]; if (!lane_id && frag_id < num_fragments_per_column) { g->col = &col_desc[column_id]; g->start_row = fragments[column_id][frag_id].start_value_idx; @@ -408,8 +409,8 @@ __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, device_span pages, device_span col_desc, - statistics_merge_group *page_grstats, - statistics_merge_group *chunk_grstats, + statistics_merge_group* page_grstats, + statistics_merge_group* chunk_grstats, int32_t num_columns) { // TODO: All writing seems to be done by thread 0. Could be replaced by thrust foreach @@ -502,9 +503,9 @@ __global__ void __launch_bounds__(128) fragment_data_size = frag_g.fragment_data_size; } // TODO (dm): this convoluted logic to limit page size needs refactoring - max_page_size = (values_in_page * 2 >= ck_g.num_values) - ? 256 * 1024 - : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 : 512 * 1024; + max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 + : (values_in_page * 3 >= ck_g.num_values) ? 
384 * 1024 + : 512 * 1024; if (num_rows >= ck_g.num_rows || (values_in_page > 0 && (page_size + fragment_data_size > max_page_size || @@ -632,7 +633,7 @@ static __device__ __constant__ uint32_t kRleRunMask[16] = { /** * @brief Variable-length encode an integer */ -inline __device__ uint8_t *VlqEncode(uint8_t *p, uint32_t v) +inline __device__ uint8_t* VlqEncode(uint8_t* p, uint32_t v) { while (v > 0x7f) { *p++ = (v | 0x80); @@ -646,7 +647,7 @@ inline __device__ uint8_t *VlqEncode(uint8_t *p, uint32_t v) * @brief Pack literal values in output bitstream (1,2,4,8,12 or 16 bits per value) */ inline __device__ void PackLiterals( - uint8_t *dst, uint32_t v, uint32_t count, uint32_t w, uint32_t t) + uint8_t* dst, uint32_t v, uint32_t count, uint32_t w, uint32_t t) { if (w == 1 || w == 2 || w == 4 || w == 8 || w == 12 || w == 16) { if (t <= (count | 0x1f)) { @@ -713,7 +714,7 @@ inline __device__ void PackLiterals( // Copy scratch data to final destination auto available_bytes = (count * w + 7) / 8; - auto scratch_bytes = reinterpret_cast(&scratch[0]); + auto scratch_bytes = reinterpret_cast(&scratch[0]); if (t < available_bytes) { dst[t] = scratch_bytes[t]; } if (t + 128 < available_bytes) { dst[t + 128] = scratch_bytes[t + 128]; } __syncthreads(); @@ -730,7 +731,7 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s *s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -759,7 +760,7 @@ static __device__ void RleEncode( if (rle_rpt_count < max_rpt_count || (flush && rle_pos == numvals)) { if (t == 0) { uint32_t const run_val = s->run_val; - uint8_t *dst = VlqEncode(s->rle_out, rle_run); + uint8_t* dst = VlqEncode(s->rle_out, rle_run); *dst++ = run_val; if (nbits > 8) { *dst++ = run_val >> 8; } s->rle_out = dst; @@ -823,7 +824,7 @@ static __device__ void RleEncode( rle_rpt_count = 0; // Defer repeat run } if (lit_div8 != 0) { - uint8_t *dst = s->rle_out + 1 + (rle_run >> 1) * nbits; + uint8_t* dst = s->rle_out + 1 + (rle_run >> 1) * nbits; PackLiterals(dst, (rle_pos + t < numvals) ? 
v0 : 0, lit_div8 * 8, nbits, t); rle_run = (rle_run + lit_div8 * 2) | 1; rle_pos = min(rle_pos + lit_div8 * 8, numvals); @@ -833,7 +834,7 @@ static __device__ void RleEncode( __syncthreads(); // Complete literal run if (!t) { - uint8_t *dst = s->rle_out; + uint8_t* dst = s->rle_out; dst[0] = rle_run; // At most 0x7f dst += 1 + nbits * (rle_run >> 1); s->rle_out = dst; @@ -868,13 +869,13 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s *s, +static __device__ void PlainBoolEncode(page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) { uint32_t rle_pos = s->rle_pos; - uint8_t *dst = s->rle_out; + uint8_t* dst = s->rle_out; while (rle_pos < numvals) { uint32_t pos = rle_pos + t; @@ -935,7 +936,7 @@ __global__ void __launch_bounds__(128, 8) using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - page_enc_state_s *const s = &state_g; + page_enc_state_s* const s = &state_g; uint32_t t = threadIdx.x; uint32_t dtype, dtype_len_in, dtype_len_out; int32_t dict_bits; @@ -1002,8 +1003,8 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } if (t < 32) { - uint8_t *cur = s->cur; - uint8_t *rle_out = s->rle_out; + uint8_t* cur = s->cur; + uint8_t* rle_out = s->rle_out; if (t < 4) { uint32_t rle_bytes = (uint32_t)(rle_out - cur) - 4; cur[t] = rle_bytes >> (t * 8); @@ -1015,7 +1016,7 @@ __global__ void __launch_bounds__(128, 8) } else if (s->page.page_type != PageType::DICTIONARY_PAGE && s->col.num_rep_level_bits() != 0 // This means there ARE repetition levels (has list) ) { - auto encode_levels = [&](uint8_t const *lvl_val_data, uint32_t nbits) { + auto encode_levels = [&](uint8_t const* lvl_val_data, uint32_t nbits) { // For list types, the repetition and definition levels are pre-calculated. We just need to // encode and write them now. if (!t) { @@ -1040,8 +1041,8 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } if (t < 32) { - uint8_t *cur = s->cur; - uint8_t *rle_out = s->rle_out; + uint8_t* cur = s->cur; + uint8_t* rle_out = s->rle_out; if (t < 4) { uint32_t rle_bytes = (uint32_t)(rle_out - cur) - 4; cur[t] = rle_bytes >> (t * 8); @@ -1056,9 +1057,11 @@ __global__ void __launch_bounds__(128, 8) } // Encode data values __syncthreads(); - dtype = s->col.physical_type; - dtype_len_out = - (dtype == INT96) ? 12 : (dtype == INT64 || dtype == DOUBLE) ? 8 : (dtype == BOOLEAN) ? 1 : 4; + dtype = s->col.physical_type; + dtype_len_out = (dtype == INT96) ? 12 + : (dtype == INT64 || dtype == DOUBLE) ? 8 + : (dtype == BOOLEAN) ? 1 + : 4; if (dtype == INT32) { dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); } else if (dtype == INT96) { @@ -1068,7 +1071,7 @@ __global__ void __launch_bounds__(128, 8) } dict_bits = (dtype == BOOLEAN) ? 
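`RleEncode` above emits Parquet's RLE/bit-packing hybrid: a varint header that is even for a repeated run (`count << 1`, followed by the value) and odd for bit-packed literals (`groups_of_8 << 1 | 1`, followed by the packed groups), which is why the literal run length is always adjusted in multiples of 8. A much-simplified host sketch, specialized to an 8-bit value width so "bit packing" degenerates to a byte copy, and using a fixed run-length threshold where the kernel makes a dynamic choice:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// LEB128 varint, as in VlqEncode above.
static void put_vlq(std::vector<uint8_t>& out, uint32_t v)
{
  while (v > 0x7f) { out.push_back(uint8_t(v | 0x80)); v >>= 7; }
  out.push_back(uint8_t(v));
}

// Minimal sketch of the RLE/bit-packing hybrid for an 8-bit width:
// even header = repeated run of (header >> 1) values, odd header =
// (header >> 1) groups of 8 literal values. The real encoder coalesces
// literal groups and picks run boundaries cooperatively across threads.
std::vector<uint8_t> rle_encode8(std::vector<uint8_t> const& in)
{
  std::vector<uint8_t> out;
  size_t i = 0;
  while (i < in.size()) {
    size_t run = 1;
    while (i + run < in.size() && in[i + run] == in[i]) ++run;
    if (run >= 8) {                      // long enough: emit a repeated run
      put_vlq(out, uint32_t(run) << 1);  // even header
      out.push_back(in[i]);
      i += run;
    } else {                             // emit one literal group of 8
      size_t n = std::min<size_t>(8, in.size() - i);
      put_vlq(out, (1u << 1) | 1);       // odd header: 1 group of 8
      for (size_t k = 0; k < 8; ++k)     // zero-pad the tail group
        out.push_back(k < n ? in[i + k] : 0);
      i += n;
    }
  }
  return out;
}

int main()
{
  std::vector<uint8_t> v(20, 5);  // 20 repeated values -> 2-byte run + value
  auto enc = rle_encode8(v);
  std::printf("encoded %zu values into %zu bytes\n", v.size(), enc.size());
}
```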
1 : (s->page.dict_bits_plus1 - 1); if (t == 0) { - uint8_t *dst = s->cur; + uint8_t* dst = s->cur; s->rle_run = 0; s->rle_pos = 0; s->rle_numvals = 0; @@ -1138,7 +1141,7 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } else { // Non-dictionary encoding - uint8_t *dst = s->cur; + uint8_t* dst = s->cur; if (is_valid) { len = dtype_len_out; @@ -1250,7 +1253,7 @@ __global__ void __launch_bounds__(128, 8) } } if (t == 0) { - uint8_t *base = s->page.page_data + s->page.max_hdr_size; + uint8_t* base = s->page.page_data + s->page.max_hdr_size; uint32_t actual_data_size = static_cast(s->cur - base); uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); s->page.max_data_size = actual_data_size; @@ -1298,7 +1301,7 @@ __global__ void __launch_bounds__(128) gpuDecideCompression(device_span 0x7f) { *p++ = v | 0x80; @@ -1339,7 +1342,7 @@ inline __device__ uint8_t *cpw_put_uint32(uint8_t *p, uint32_t v) return p; } -inline __device__ uint8_t *cpw_put_uint64(uint8_t *p, uint64_t v) +inline __device__ uint8_t* cpw_put_uint64(uint8_t* p, uint64_t v) { while (v > 0x7f) { *p++ = v | 0x80; @@ -1349,19 +1352,19 @@ inline __device__ uint8_t *cpw_put_uint64(uint8_t *p, uint64_t v) return p; } -inline __device__ uint8_t *cpw_put_int32(uint8_t *p, int32_t v) +inline __device__ uint8_t* cpw_put_int32(uint8_t* p, int32_t v) { int32_t s = (v < 0); return cpw_put_uint32(p, (v ^ -s) * 2 + s); } -inline __device__ uint8_t *cpw_put_int64(uint8_t *p, int64_t v) +inline __device__ uint8_t* cpw_put_int64(uint8_t* p, int64_t v) { int64_t s = (v < 0); return cpw_put_uint64(p, (v ^ -s) * 2 + s); } -inline __device__ uint8_t *cpw_put_fldh(uint8_t *p, int f, int cur, int t) +inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, int t) { if (f > cur && f <= cur + 15) { *p++ = ((f - cur) << 4) | t; @@ -1373,11 +1376,11 @@ inline __device__ uint8_t *cpw_put_fldh(uint8_t *p, int f, int cur, int t) } class header_encoder { - uint8_t *current_header_ptr; + uint8_t* current_header_ptr; int current_field_index; public: - inline __device__ header_encoder(uint8_t *header_start) + inline __device__ header_encoder(uint8_t* header_start) : current_header_ptr(header_start), current_field_index(0) { } @@ -1411,7 +1414,7 @@ class header_encoder { current_field_index = field; } - inline __device__ void field_binary(int field, const void *value, uint32_t length) + inline __device__ void field_binary(int field, const void* value, uint32_t length) { current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_BINARY); @@ -1421,21 +1424,21 @@ class header_encoder { current_field_index = field; } - inline __device__ void end(uint8_t **header_end, bool termination_flag = true) + inline __device__ void end(uint8_t** header_end, bool termination_flag = true) { if (termination_flag == false) { *current_header_ptr++ = 0; } *header_end = current_header_ptr; } - inline __device__ uint8_t *get_ptr(void) { return current_header_ptr; } + inline __device__ uint8_t* get_ptr(void) { return current_header_ptr; } - inline __device__ void set_ptr(uint8_t *ptr) { current_header_ptr = ptr; } + inline __device__ void set_ptr(uint8_t* ptr) { current_header_ptr = ptr; } }; -__device__ uint8_t *EncodeStatistics(uint8_t *start, - const statistics_chunk *s, +__device__ uint8_t* EncodeStatistics(uint8_t* start, + const statistics_chunk* s, uint8_t dtype, - float *fp_scratch) + float* fp_scratch) { uint8_t *end, dtype_len; switch (dtype) { @@ -1488,7 +1491,7 @@ __global__ void __launch_bounds__(128) 
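`cpw_put_fldh` and `cpw_put_int32` above are Thrift compact-protocol primitives: field ids are delta-encoded against the previous field, with `((delta << 4) | type)` as the short form when the delta fits in 1..15, and signed integers go through the zigzag map `(v ^ -s) * 2 + s`. A host sketch of both; the long-form branch (elided in the hunk) is assumed here to follow the standard compact protocol, writing the type byte and then the absolute field id as a zigzag varint:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// LEB128 varint, matching cpw_put_uint32.
static void put_uvarint(std::vector<uint8_t>& out, uint64_t v)
{
  while (v > 0x7f) { out.push_back(uint8_t(v | 0x80)); v >>= 7; }
  out.push_back(uint8_t(v));
}

// Zigzag map, matching cpw_put_int32: 0,-1,1,-2,... -> 0,1,2,3,...
static uint32_t zigzag32(int32_t v)
{
  int32_t s = (v < 0);
  return uint32_t(v ^ -s) * 2 + uint32_t(s);
}

// Compact-protocol field header, matching cpw_put_fldh: short form packs
// the delta to the previous field id with the type nibble; the long form
// (assumed, standard Thrift) writes the type then the id as zigzag varint.
static void put_field_header(std::vector<uint8_t>& out, int field, int& cur, int type)
{
  if (field > cur && field <= cur + 15) {
    out.push_back(uint8_t(((field - cur) << 4) | type));  // short form
  } else {
    out.push_back(uint8_t(type));                         // long form
    put_uvarint(out, zigzag32(field));
  }
  cur = field;
}

int main()
{
  std::vector<uint8_t> buf;
  int cur = 0;
  put_field_header(buf, 1, cur, /*ST_FLD_I32=*/5);  // delta 1 -> byte 0x15
  put_uvarint(buf, zigzag32(-3));                   // value -3 -> byte 0x05
  std::printf("%02x %02x\n", buf[0], buf[1]);
}
```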
gpuEncodePageHeaders(device_span pages, device_span comp_stat, device_span page_stats, - const statistics_chunk *chunk_stats) + const statistics_chunk* chunk_stats) { // When this whole kernel becomes single thread, the following variables need not be __shared__ __shared__ __align__(8) parquet_column_device_view col_g; @@ -1579,7 +1582,7 @@ __global__ void __launch_bounds__(1024) uint32_t t = threadIdx.x; uint8_t *dst, *dst_base; - const EncPage *first_page; + const EncPage* first_page; uint32_t num_pages, uncompressed_size; if (t == 0) ck_g = chunks[blockIdx.x]; @@ -1592,7 +1595,7 @@ __global__ void __launch_bounds__(1024) dst_base = dst; uncompressed_size = ck_g.bfr_size; for (uint32_t page = 0; page < num_pages; page++) { - const uint8_t *src; + const uint8_t* src; uint32_t hdr_len, data_len; if (t == 0) { page_g = first_page[page]; } @@ -1625,8 +1628,8 @@ __global__ void __launch_bounds__(1024) * */ struct def_level_fn { - column_device_view const *parent_col; - uint8_t const *d_nullability; + column_device_view const* parent_col; + uint8_t const* d_nullability; uint8_t sub_level_start; uint8_t curr_def_level; @@ -1757,12 +1760,14 @@ struct def_level_fn { */ dremel_data get_dremel_data(column_view h_col, // TODO(cp): use device_span once it is converted to a single hd_vec - rmm::device_uvector const &d_nullability, - std::vector const &nullability, + rmm::device_uvector const& d_nullability, + std::vector const& nullability, rmm::cuda_stream_view stream) { auto get_list_level = [](column_view col) { - while (col.type().id() == type_id::STRUCT) { col = col.child(0); } + while (col.type().id() == type_id::STRUCT) { + col = col.child(0); + } return col; }; @@ -1832,7 +1837,7 @@ dremel_data get_dremel_data(column_view h_col, } std::unique_ptr device_view_owners; - column_device_view *d_nesting_levels; + column_device_view* d_nesting_levels; std::tie(device_view_owners, d_nesting_levels) = contiguous_copy_column_device_views(nesting_levels, stream); @@ -2147,8 +2152,8 @@ void InitEncoderPages(device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - statistics_merge_group *page_grstats, - statistics_merge_group *chunk_grstats, + statistics_merge_group* page_grstats, + statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream) { auto num_rowgroups = chunks.size().first; @@ -2199,7 +2204,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view void EncodePageHeaders(device_span pages, device_span comp_stat, device_span page_stats, - const statistics_chunk *chunk_stats, + const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream) { // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index bc10fd92566..a5536775116 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -45,9 +45,9 @@ static const __device__ __constant__ uint8_t g_list2struct[16] = {0, ST_FLD_LIST}; struct byte_stream_s { - const uint8_t *cur; - const uint8_t *end; - const uint8_t *base; + const uint8_t* cur; + const uint8_t* end; + const uint8_t* base; // Parsed symbols PageType page_type; PageInfo page; @@ -61,12 +61,12 @@ struct byte_stream_s { * * @return Current byte pointed to by the byte stream */ -inline __device__ unsigned int getb(byte_stream_s *bs) +inline __device__ unsigned int getb(byte_stream_s* bs) { return (bs->cur < bs->end) ? 
*bs->cur++ : 0; } -inline __device__ void skip_bytes(byte_stream_s *bs, size_t bytecnt) +inline __device__ void skip_bytes(byte_stream_s* bs, size_t bytecnt) { bytecnt = min(bytecnt, (size_t)(bs->end - bs->cur)); bs->cur += bytecnt; @@ -83,7 +83,7 @@ inline __device__ void skip_bytes(byte_stream_s *bs, size_t bytecnt) * * @return Decoded 32 bit integer */ -__device__ uint32_t get_u32(byte_stream_s *bs) +__device__ uint32_t get_u32(byte_stream_s* bs) { uint32_t v = 0, l = 0, c; do { @@ -105,13 +105,13 @@ __device__ uint32_t get_u32(byte_stream_s *bs) * * @return Decoded 32 bit integer */ -inline __device__ int32_t get_i32(byte_stream_s *bs) +inline __device__ int32_t get_i32(byte_stream_s* bs) { uint32_t u = get_u32(bs); return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1)); } -__device__ void skip_struct_field(byte_stream_s *bs, int field_type) +__device__ void skip_struct_field(byte_stream_s* bs, int field_type) { int struct_depth = 0; int rep_cnt = 0; @@ -161,11 +161,11 @@ __device__ void skip_struct_field(byte_stream_s *bs, int field_type) */ struct ParquetFieldInt32 { int field; - int32_t &val; + int32_t& val; - __device__ ParquetFieldInt32(int f, int32_t &v) : field(f), val(v) {} + __device__ ParquetFieldInt32(int f, int32_t& v) : field(f), val(v) {} - inline __device__ bool operator()(byte_stream_s *bs, int field_type) + inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = get_i32(bs); return (field_type != ST_FLD_I32); @@ -180,11 +180,11 @@ struct ParquetFieldInt32 { template struct ParquetFieldEnum { int field; - Enum &val; + Enum& val; - __device__ ParquetFieldEnum(int f, Enum &v) : field(f), val(v) {} + __device__ ParquetFieldEnum(int f, Enum& v) : field(f), val(v) {} - inline __device__ bool operator()(byte_stream_s *bs, int field_type) + inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = static_cast(get_i32(bs)); return (field_type != ST_FLD_I32); @@ -204,7 +204,7 @@ struct ParquetFieldStruct { __device__ ParquetFieldStruct(int f) : field(f) {} - inline __device__ bool operator()(byte_stream_s *bs, int field_type) + inline __device__ bool operator()(byte_stream_s* bs, int field_type) { return ((field_type != ST_FLD_STRUCT) || !op(bs)); } @@ -226,10 +226,10 @@ struct ParquetFieldStruct { template struct FunctionSwitchImpl { template - static inline __device__ bool run(byte_stream_s *bs, + static inline __device__ bool run(byte_stream_s* bs, int field_type, - const int &field, - thrust::tuple &ops) + const int& field, + thrust::tuple& ops) { if (field == thrust::get(ops).field) { return thrust::get(ops)(bs, field_type); @@ -242,10 +242,10 @@ struct FunctionSwitchImpl { template <> struct FunctionSwitchImpl<0> { template - static inline __device__ bool run(byte_stream_s *bs, + static inline __device__ bool run(byte_stream_s* bs, int field_type, - const int &field, - thrust::tuple &ops) + const int& field, + thrust::tuple& ops) { if (field == thrust::get<0>(ops).field) { return thrust::get<0>(ops)(bs, field_type); @@ -267,7 +267,7 @@ struct FunctionSwitchImpl<0> { * byte stream. Otherwise true is returned. 
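`get_u32` and `get_i32` above are the decode-side mirrors of those writer helpers: a ULEB128 varint accumulated 7 bits at a time, then the zigzag inverse `(u >> 1) ^ -(u & 1)` to recover the signed value. A host mirror of both, reproducing the exact expressions from the byte-stream readers:

```cpp
#include <cstdint>
#include <cstdio>

// Host mirror of get_u32: ULEB128 varint decode, 7 payload bits per byte,
// reading 0 past the end exactly as the bounds-checked device reader does.
static uint32_t get_u32(uint8_t const*& cur, uint8_t const* end)
{
  uint32_t v = 0, l = 0, c;
  do {
    c = (cur < end) ? *cur++ : 0;
    v |= (c & 0x7f) << l;
    l += 7;
  } while ((c & 0x80) && l < 32);
  return v;
}

// Host mirror of get_i32: zigzag inverse, mapping 0,1,2,3,... back to
// 0,-1,1,-2,... with the same expression used above.
static int32_t get_i32(uint8_t const*& cur, uint8_t const* end)
{
  uint32_t u = get_u32(cur, end);
  return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1));
}

int main()
{
  uint8_t const bytes[] = {0x05, 0xac, 0x02};  // zigzag(-3), varint(300)
  uint8_t const* p      = bytes;
  int32_t a  = get_i32(p, bytes + 3);  // sequenced: p advances between calls
  uint32_t b = get_u32(p, bytes + 3);
  std::printf("%d %u\n", a, b);  // -3 300
}
```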
*/ template -inline __device__ bool parse_header(thrust::tuple &op, byte_stream_s *bs) +inline __device__ bool parse_header(thrust::tuple& op, byte_stream_s* bs) { constexpr int index = thrust::tuple_size>::value - 1; int field = 0; @@ -284,7 +284,7 @@ inline __device__ bool parse_header(thrust::tuple &op, byte_stream_ } struct gpuParseDataPageHeader { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values), ParquetFieldEnum(2, bs->page.encoding), @@ -295,7 +295,7 @@ struct gpuParseDataPageHeader { }; struct gpuParseDictionaryPageHeader { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values), ParquetFieldEnum(2, bs->page.encoding)); @@ -304,7 +304,7 @@ struct gpuParseDictionaryPageHeader { }; struct gpuParseDataPageHeaderV2 { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values), ParquetFieldInt32(3, bs->page.num_rows), @@ -316,7 +316,7 @@ struct gpuParseDataPageHeaderV2 { }; struct gpuParsePageHeader { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldEnum(1, bs->page_type), ParquetFieldInt32(2, bs->page.uncompressed_page_size), @@ -336,14 +336,14 @@ struct gpuParsePageHeader { */ // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128) - gpuDecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks) + gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks) { gpuParsePageHeader parse_page_header; __shared__ byte_stream_s bs_g[4]; int lane_id = threadIdx.x % 32; int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - byte_stream_s *const bs = &bs_g[threadIdx.x / 32]; + byte_stream_s* const bs = &bs_g[threadIdx.x / 32]; if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk]; __syncthreads(); @@ -354,7 +354,7 @@ extern "C" __global__ void __launch_bounds__(128) uint32_t dictionary_page_count = 0; int32_t max_num_pages; int32_t num_dict_pages = bs->ck.num_dict_pages; - PageInfo *page_info; + PageInfo* page_info; if (!lane_id) { bs->base = bs->cur = bs->ck.compressed_data; @@ -402,7 +402,7 @@ extern "C" __global__ void __launch_bounds__(128) break; default: index_out = -1; break; } - bs->page.page_data = const_cast(bs->cur); + bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; } else { bs->cur = bs->end; @@ -434,21 +434,21 @@ extern "C" __global__ void __launch_bounds__(128) */ // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128) - gpuBuildStringDictionaryIndex(ColumnChunkDesc *chunks, int32_t num_chunks) + gpuBuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks) { __shared__ ColumnChunkDesc chunk_g[4]; int lane_id = threadIdx.x % 32; int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - ColumnChunkDesc *const ck = &chunk_g[threadIdx.x / 32]; + ColumnChunkDesc* const ck = &chunk_g[threadIdx.x / 32]; if (chunk < num_chunks and lane_id == 0) *ck = chunks[chunk]; __syncthreads(); if (chunk >= num_chunks) { return; } if (!lane_id && ck->num_dict_pages > 0 && ck->str_dict_index) { // Data type to describe a string - string_index_pair *dict_index = ck->str_dict_index; - const uint8_t *dict = ck->page_info[0].page_data; + string_index_pair* dict_index = 
ck->str_dict_index; + const uint8_t* dict = ck->page_info[0].page_data; int dict_size = ck->page_info[0].uncompressed_page_size; int num_entries = ck->page_info[0].num_input_values; int pos = 0, cur = 0; @@ -464,13 +464,13 @@ extern "C" __global__ void __launch_bounds__(128) } } // TODO: Could store 8 entries in shared mem, then do a single warp-wide store - dict_index[i].first = reinterpret_cast(dict + pos + 4); + dict_index[i].first = reinterpret_cast(dict + pos + 4); dict_index[i].second = len; } } } -void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, +void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream) { @@ -479,7 +479,7 @@ void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, gpuDecodePageHeaders<<>>(chunks, num_chunks); } -void __host__ BuildStringDictionaryIndex(ColumnChunkDesc *chunks, +void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/parquet/parquet.cpp b/cpp/src/io/parquet/parquet.cpp index 2a1bd0d5a18..6c658788fa1 100644 --- a/cpp/src/io/parquet/parquet.cpp +++ b/cpp/src/io/parquet/parquet.cpp @@ -63,7 +63,8 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) if (n == 0xf) n = get_i32(); t = g_list2struct[c & 0xf]; if (depth > 10) return false; - for (int32_t i = 0; i < n; i++) skip_struct_field(t, depth + 1); + for (int32_t i = 0; i < n; i++) + skip_struct_field(t, depth + 1); } break; case ST_FLD_STRUCT: for (;;) { @@ -84,10 +85,10 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) template struct FunctionSwitchImpl { template - static inline bool run(CompactProtocolReader *cpr, + static inline bool run(CompactProtocolReader* cpr, int field_type, - const int &field, - std::tuple &ops) + const int& field, + std::tuple& ops) { if (field == std::get(ops).field()) { return std::get(ops)(cpr, field_type); @@ -100,10 +101,10 @@ struct FunctionSwitchImpl { template <> struct FunctionSwitchImpl<0> { template - static inline bool run(CompactProtocolReader *cpr, + static inline bool run(CompactProtocolReader* cpr, int field_type, - const int &field, - std::tuple &ops) + const int& field, + std::tuple& ops) { if (field == std::get<0>(ops).field()) { return std::get<0>(ops)(cpr, field_type); @@ -115,7 +116,7 @@ struct FunctionSwitchImpl<0> { }; template -inline bool function_builder(CompactProtocolReader *cpr, std::tuple &op) +inline bool function_builder(CompactProtocolReader* cpr, std::tuple& op) { constexpr int index = std::tuple_size>::value - 1; int field = 0; @@ -131,7 +132,7 @@ inline bool function_builder(CompactProtocolReader *cpr, std::tuple return true; } -bool CompactProtocolReader::read(FileMetaData *f) +bool CompactProtocolReader::read(FileMetaData* f) { auto op = std::make_tuple(ParquetFieldInt32(1, f->version), ParquetFieldStructList(2, f->schema), @@ -142,7 +143,7 @@ bool CompactProtocolReader::read(FileMetaData *f) return function_builder(this, op); } -bool CompactProtocolReader::read(SchemaElement *s) +bool CompactProtocolReader::read(SchemaElement* s) { auto op = std::make_tuple(ParquetFieldEnum(1, s->type), ParquetFieldInt32(2, s->type_length), @@ -156,7 +157,7 @@ bool CompactProtocolReader::read(SchemaElement *s) return function_builder(this, op); } -bool CompactProtocolReader::read(LogicalType *l) +bool CompactProtocolReader::read(LogicalType* l) { auto op = std::make_tuple(ParquetFieldUnion(1, l->isset.STRING, l->STRING), @@ -174,40 +175,40 @@ bool 
CompactProtocolReader::read(LogicalType *l) return function_builder(this, op); } -bool CompactProtocolReader::read(DecimalType *d) +bool CompactProtocolReader::read(DecimalType* d) { auto op = std::make_tuple(ParquetFieldInt32(1, d->scale), ParquetFieldInt32(2, d->precision)); return function_builder(this, op); } -bool CompactProtocolReader::read(TimeType *t) +bool CompactProtocolReader::read(TimeType* t) { auto op = std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); return function_builder(this, op); } -bool CompactProtocolReader::read(TimestampType *t) +bool CompactProtocolReader::read(TimestampType* t) { auto op = std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); return function_builder(this, op); } -bool CompactProtocolReader::read(TimeUnit *u) +bool CompactProtocolReader::read(TimeUnit* u) { auto op = std::make_tuple(ParquetFieldUnion(1, u->isset.MILLIS, u->MILLIS), ParquetFieldUnion(2, u->isset.MICROS, u->MICROS)); return function_builder(this, op); } -bool CompactProtocolReader::read(IntType *i) +bool CompactProtocolReader::read(IntType* i) { auto op = std::make_tuple(ParquetFieldInt8(1, i->bitWidth), ParquetFieldBool(2, i->isSigned)); return function_builder(this, op); } -bool CompactProtocolReader::read(RowGroup *r) +bool CompactProtocolReader::read(RowGroup* r) { auto op = std::make_tuple(ParquetFieldStructList(1, r->columns), ParquetFieldInt64(2, r->total_byte_size), @@ -215,7 +216,7 @@ bool CompactProtocolReader::read(RowGroup *r) return function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunk *c) +bool CompactProtocolReader::read(ColumnChunk* c) { auto op = std::make_tuple(ParquetFieldString(1, c->file_path), ParquetFieldInt64(2, c->file_offset), @@ -227,7 +228,7 @@ bool CompactProtocolReader::read(ColumnChunk *c) return function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunkMetaData *c) +bool CompactProtocolReader::read(ColumnChunkMetaData* c) { auto op = std::make_tuple(ParquetFieldEnum(1, c->type), ParquetFieldEnumList(2, c->encodings), @@ -243,7 +244,7 @@ bool CompactProtocolReader::read(ColumnChunkMetaData *c) return function_builder(this, op); } -bool CompactProtocolReader::read(PageHeader *p) +bool CompactProtocolReader::read(PageHeader* p) { auto op = std::make_tuple(ParquetFieldEnum(1, p->type), ParquetFieldInt32(2, p->uncompressed_page_size), @@ -253,7 +254,7 @@ bool CompactProtocolReader::read(PageHeader *p) return function_builder(this, op); } -bool CompactProtocolReader::read(DataPageHeader *d) +bool CompactProtocolReader::read(DataPageHeader* d) { auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), ParquetFieldEnum(2, d->encoding), @@ -262,14 +263,14 @@ bool CompactProtocolReader::read(DataPageHeader *d) return function_builder(this, op); } -bool CompactProtocolReader::read(DictionaryPageHeader *d) +bool CompactProtocolReader::read(DictionaryPageHeader* d) { auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), ParquetFieldEnum(2, d->encoding)); return function_builder(this, op); } -bool CompactProtocolReader::read(KeyValue *k) +bool CompactProtocolReader::read(KeyValue* k) { auto op = std::make_tuple(ParquetFieldString(1, k->key), ParquetFieldString(2, k->value)); return function_builder(this, op); @@ -282,24 +283,24 @@ bool CompactProtocolReader::read(KeyValue *k) * * @return True if schema constructed completely, false otherwise */ -bool CompactProtocolReader::InitSchema(FileMetaData *md) +bool 
CompactProtocolReader::InitSchema(FileMetaData* md) { if (static_cast(WalkSchema(md)) != md->schema.size()) return false; /* Inside FileMetaData, there is a std::vector of RowGroups and each RowGroup contains a * a std::vector of ColumnChunks. Each ColumnChunk has a member ColumnMetaData, which contains * a std::vector of std::strings representing paths. The purpose of the code below is to set the - * schema_idx of each column of each row to it corresonding row_group. This is effectively + * schema_idx of each column of each row to it corresponding row_group. This is effectively * mapping the columns to the schema. */ - for (auto &row_group : md->row_groups) { + for (auto& row_group : md->row_groups) { int current_schema_index = 0; - for (auto &column : row_group.columns) { + for (auto& column : row_group.columns) { int parent = 0; // root of schema - for (auto const &path : column.meta_data.path_in_schema) { + for (auto const& path : column.meta_data.path_in_schema) { auto const it = [&] { // find_if starting at (current_schema_index + 1) and then wrapping - auto schema = [&](auto const &e) { return e.parent_idx == parent && e.name == path; }; + auto schema = [&](auto const& e) { return e.parent_idx == parent && e.name == path; }; auto mid = md->schema.cbegin() + current_schema_index + 1; auto it = std::find_if(mid, md->schema.cend(), schema); if (it != md->schema.cend()) return it; @@ -328,10 +329,10 @@ bool CompactProtocolReader::InitSchema(FileMetaData *md) * @return The node index that was populated */ int CompactProtocolReader::WalkSchema( - FileMetaData *md, int idx, int parent_idx, int max_def_level, int max_rep_level) + FileMetaData* md, int idx, int parent_idx, int max_def_level, int max_rep_level) { if (idx >= 0 && (size_t)idx < md->schema.size()) { - SchemaElement *e = &md->schema[idx]; + SchemaElement* e = &md->schema[idx]; if (e->repetition_type == OPTIONAL) { ++max_def_level; } else if (e->repetition_type == REPEATED) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index eefff518a9a..2232017409d 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -166,7 +166,7 @@ struct SchemaElement { int max_repetition_level = 0; int parent_idx = 0; - bool operator==(SchemaElement const &other) const + bool operator==(SchemaElement const& other) const { return type == other.type && converted_type == other.converted_type && type_length == other.type_length && repetition_type == other.repetition_type && @@ -232,7 +232,7 @@ struct ColumnChunkMetaData { * column * * Each column chunk lives in a particular row group and are guaranteed to be - * contiguous in the file. Any mssing or corrupted chunks can be skipped during + * contiguous in the file. Any missing or corrupted chunks can be skipped during * reading. 
*/ struct ColumnChunk { @@ -356,8 +356,8 @@ class CompactProtocolReader { static const uint8_t g_list2struct[16]; public: - explicit CompactProtocolReader(const uint8_t *base = nullptr, size_t len = 0) { init(base, len); } - void init(const uint8_t *base, size_t len) + explicit CompactProtocolReader(const uint8_t* base = nullptr, size_t len = 0) { init(base, len); } + void init(const uint8_t* base, size_t len) { m_base = m_cur = base; m_end = base + len; @@ -400,7 +400,7 @@ class CompactProtocolReader { uint64_t u = get_u64(); return (int64_t)((u >> 1u) ^ -(int64_t)(u & 1)); } - int32_t get_listh(uint8_t *el_type) noexcept + int32_t get_listh(uint8_t* el_type) noexcept { uint32_t c = getb(); int32_t sz = c >> 4; @@ -412,40 +412,40 @@ class CompactProtocolReader { public: // Generate Thrift structure parsing routines - bool read(FileMetaData *f); - bool read(SchemaElement *s); - bool read(LogicalType *l); - bool read(DecimalType *d); - bool read(TimeType *t); - bool read(TimeUnit *u); - bool read(TimestampType *t); - bool read(IntType *t); - bool read(RowGroup *r); - bool read(ColumnChunk *c); - bool read(ColumnChunkMetaData *c); - bool read(PageHeader *p); - bool read(DataPageHeader *d); - bool read(DictionaryPageHeader *d); - bool read(KeyValue *k); + bool read(FileMetaData* f); + bool read(SchemaElement* s); + bool read(LogicalType* l); + bool read(DecimalType* d); + bool read(TimeType* t); + bool read(TimeUnit* u); + bool read(TimestampType* t); + bool read(IntType* t); + bool read(RowGroup* r); + bool read(ColumnChunk* c); + bool read(ColumnChunkMetaData* c); + bool read(PageHeader* p); + bool read(DataPageHeader* d); + bool read(DictionaryPageHeader* d); + bool read(KeyValue* k); public: static int NumRequiredBits(uint32_t max_level) noexcept { return 32 - CountLeadingZeros32(max_level); } - bool InitSchema(FileMetaData *md); + bool InitSchema(FileMetaData* md); protected: - int WalkSchema(FileMetaData *md, + int WalkSchema(FileMetaData* md, int idx = 0, int parent_idx = 0, int max_def_level = 0, int max_rep_level = 0); protected: - const uint8_t *m_base = nullptr; - const uint8_t *m_cur = nullptr; - const uint8_t *m_end = nullptr; + const uint8_t* m_base = nullptr; + const uint8_t* m_cur = nullptr; + const uint8_t* m_end = nullptr; friend class ParquetFieldBool; friend class ParquetFieldInt8; @@ -473,12 +473,12 @@ class CompactProtocolReader { */ class ParquetFieldBool { int field_val; - bool &val; + bool& val; public: - ParquetFieldBool(int f, bool &v) : field_val(f), val(v) {} + ParquetFieldBool(int f, bool& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { return (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) || !(val = (field_type == ST_FLD_TRUE), true); @@ -494,12 +494,12 @@ class ParquetFieldBool { */ class ParquetFieldInt8 { int field_val; - int8_t &val; + int8_t& val; public: - ParquetFieldInt8(int f, int8_t &v) : field_val(f), val(v) {} + ParquetFieldInt8(int f, int8_t& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = cpr->getb(); return (field_type != ST_FLD_BYTE); @@ -515,12 +515,12 @@ class ParquetFieldInt8 { */ class ParquetFieldInt32 { int field_val; - int32_t &val; + int32_t& val; public: - ParquetFieldInt32(int f, int32_t &v) : field_val(f), val(v) {} + ParquetFieldInt32(int f, int32_t& v) : field_val(f), val(v) {} - 
inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = cpr->get_i32(); return (field_type != ST_FLD_I32); @@ -536,12 +536,12 @@ class ParquetFieldInt32 { */ class ParquetFieldInt64 { int field_val; - int64_t &val; + int64_t& val; public: - ParquetFieldInt64(int f, int64_t &v) : field_val(f), val(v) {} + ParquetFieldInt64(int f, int64_t& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = cpr->get_i64(); return (field_type < ST_FLD_I16 || field_type > ST_FLD_I64); @@ -559,12 +559,12 @@ class ParquetFieldInt64 { template class ParquetFieldStructListFunctor { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldStructListFunctor(int f, std::vector &v) : field_val(f), val(v) {} + ParquetFieldStructListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_LIST) return true; @@ -584,7 +584,7 @@ class ParquetFieldStructListFunctor { }; template -ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector &v) +ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector& v) { return ParquetFieldStructListFunctor(f, v); } @@ -597,17 +597,17 @@ ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector &v */ class ParquetFieldString { int field_val; - std::string &val; + std::string& val; public: - ParquetFieldString(int f, std::string &v) : field_val(f), val(v) {} + ParquetFieldString(int f, std::string& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_BINARY) return true; uint32_t n = cpr->get_u32(); if (n < (size_t)(cpr->m_end - cpr->m_cur)) { - val.assign((const char *)cpr->m_cur, n); + val.assign((const char*)cpr->m_cur, n); cpr->m_cur += n; return false; } else { @@ -627,12 +627,12 @@ class ParquetFieldString { template class ParquetFieldStructFunctor { int field_val; - T &val; + T& val; public: - ParquetFieldStructFunctor(int f, T &v) : field_val(f), val(v) {} + ParquetFieldStructFunctor(int f, T& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); } @@ -641,7 +641,7 @@ class ParquetFieldStructFunctor { }; template -ParquetFieldStructFunctor ParquetFieldStruct(int f, T &v) +ParquetFieldStructFunctor ParquetFieldStruct(int f, T& v) { return ParquetFieldStructFunctor(f, v); } @@ -657,13 +657,13 @@ ParquetFieldStructFunctor ParquetFieldStruct(int f, T &v) template class ParquetFieldUnionFunctor { int field_val; - bool &is_set; - T &val; + bool& is_set; + T& val; public: - ParquetFieldUnionFunctor(int f, bool &b, T &v) : field_val(f), is_set(b), val(v) {} + ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_STRUCT) { return true; @@ -679,13 +679,13 @@ class ParquetFieldUnionFunctor { template struct ParquetFieldUnionFunctor { int field_val; - bool 
&is_set; - T &val; + bool& is_set; + T& val; public: - ParquetFieldUnionFunctor(int f, bool &b, T &v) : field_val(f), is_set(b), val(v) {} + ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_STRUCT) { return true; @@ -700,7 +700,7 @@ struct ParquetFieldUnionFunctor { }; template -ParquetFieldUnionFunctor::value> ParquetFieldUnion(int f, bool &b, T &v) +ParquetFieldUnionFunctor::value> ParquetFieldUnion(int f, bool& b, T& v) { return ParquetFieldUnionFunctor::value>(f, b, v); } @@ -713,11 +713,11 @@ ParquetFieldUnionFunctor::value> ParquetFieldUnion(int f, bo template class ParquetFieldEnum { int field_val; - Enum &val; + Enum& val; public: - ParquetFieldEnum(int f, Enum &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldEnum(int f, Enum& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = static_cast(cpr->get_i32()); return (field_type != ST_FLD_I32); @@ -735,11 +735,11 @@ class ParquetFieldEnum { template class ParquetFieldEnumListFunctor { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldEnumListFunctor(int f, std::vector &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldEnumListFunctor(int f, std::vector& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_LIST) return true; int current_byte = cpr->getb(); @@ -747,7 +747,9 @@ class ParquetFieldEnumListFunctor { int n = current_byte >> 4; if (n == 0xf) n = cpr->get_u32(); val.resize(n); - for (int32_t i = 0; i < n; i++) { val[i] = static_cast(cpr->get_i32()); } + for (int32_t i = 0; i < n; i++) { + val[i] = static_cast(cpr->get_i32()); + } return false; } @@ -755,7 +757,7 @@ class ParquetFieldEnumListFunctor { }; template -ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector &v) +ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector& v) { return ParquetFieldEnumListFunctor(field, v); } @@ -768,11 +770,11 @@ ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector &v */ class ParquetFieldStringList { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldStringList(int f, std::vector &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldStringList(int f, std::vector& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_LIST) return true; int current_byte = cpr->getb(); @@ -783,7 +785,7 @@ class ParquetFieldStringList { for (int32_t i = 0; i < n; i++) { uint32_t l = cpr->get_u32(); if (l < (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].assign((const char *)cpr->m_cur, l); + val[i].assign((const char*)cpr->m_cur, l); cpr->m_cur += l; } else return true; @@ -801,14 +803,14 @@ class ParquetFieldStringList { */ class ParquetFieldStructBlob { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldStructBlob(int f, std::vector &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldStructBlob(int f, std::vector& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int 
field_type) { if (field_type != ST_FLD_STRUCT) return true; - const uint8_t *start = cpr->m_cur; + const uint8_t* start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } return false; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1b6bb9ad7ca..abd7ccef523 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -95,15 +95,15 @@ struct PageNestingInfo { int32_t value_count; // total # of values decoded in this page/nesting-level int32_t null_count; // null count int32_t valid_map_offset; // current offset in bits relative to valid_map - uint8_t *data_out; // pointer into output buffer - uint32_t *valid_map; // pointer into output validity buffer + uint8_t* data_out; // pointer into output buffer + uint32_t* valid_map; // pointer into output validity buffer }; /** * @brief Struct describing a particular page of column chunk data */ struct PageInfo { - uint8_t *page_data; // Compressed page data before decompression, or uncompressed data after + uint8_t* page_data; // Compressed page data before decompression, or uncompressed data after // decompression int32_t compressed_page_size; // compressed data size in bytes int32_t uncompressed_page_size; // uncompressed data size in bytes @@ -139,7 +139,7 @@ struct PageInfo { // nesting information (input/output) for each page int num_nesting_levels; - PageNestingInfo *nesting; + PageNestingInfo* nesting; }; /** @@ -148,7 +148,7 @@ struct PageInfo { struct ColumnChunkDesc { ColumnChunkDesc() = default; explicit constexpr ColumnChunkDesc(size_t compressed_size_, - uint8_t *compressed_data_, + uint8_t* compressed_data_, size_t num_values_, uint16_t datatype_, uint16_t datatype_length_, @@ -190,7 +190,7 @@ struct ColumnChunkDesc { { } - uint8_t const *compressed_data; // pointer to compressed column chunk data + uint8_t const* compressed_data; // pointer to compressed column chunk data size_t compressed_size; // total compressed data size for this chunk size_t num_values; // total number of values in this column size_t start_row; // starting row of this chunk @@ -204,11 +204,11 @@ struct ColumnChunkDesc { int32_t num_data_pages; // number of data pages int32_t num_dict_pages; // number of dictionary pages int32_t max_num_pages; // size of page_info array - PageInfo *page_info; // output page info for up to num_dict_pages + + PageInfo* page_info; // output page info for up to num_dict_pages + // num_data_pages (dictionary pages first) - string_index_pair *str_dict_index; // index for string dictionary - uint32_t **valid_map_base; // base pointers of valid bit map for this column - void **column_data_base; // base pointers of column data + string_index_pair* str_dict_index; // index for string dictionary + uint32_t** valid_map_base; // base pointers of valid bit map for this column + void** column_data_base; // base pointers of column data int8_t codec; // compressed codec enum int8_t converted_type; // converted type enum int8_t decimal_scale; // decimal scale pow(10, -decimal_scale) @@ -222,21 +222,21 @@ struct ColumnChunkDesc { * @brief Struct describing an encoder column */ struct parquet_column_device_view : stats_column_desc { - uint32_t *dict_index; //!< Dictionary index [row] - uint32_t *dict_data; //!< Dictionary data (unique row indices) + uint32_t* dict_index; //!< Dictionary index [row] + uint32_t* dict_data; //!< Dictionary data (unique row indices) uint8_t physical_type; //!< physical data type uint8_t 
converted_type; //!< logical data type uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels constexpr uint8_t num_def_level_bits() { return level_bits & 0xf; } constexpr uint8_t num_rep_level_bits() { return level_bits >> 4; } - size_type const *const - *nesting_offsets; //!< If column is a nested type, contains offset array of each nesting level + size_type const* const* + nesting_offsets; //!< If column is a nested type, contains offset array of each nesting level - size_type const *level_offsets; //!< Offset array for per-row pre-calculated rep/def level values - uint8_t const *rep_values; //!< Pre-calculated repetition level values - uint8_t const *def_values; //!< Pre-calculated definition level values - uint8_t *nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is + size_type const* level_offsets; //!< Offset array for per-row pre-calculated rep/def level values + uint8_t const* rep_values; //!< Pre-calculated repetition level values + uint8_t const* def_values; //!< Pre-calculated definition level values + uint8_t* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is //!< nullability of parent_column. May be different from col.nullable() in //!< case of chunked writing. }; @@ -265,7 +265,7 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); /** * @brief Return the byte length of parquet dtypes that are physically represented by INT32 */ -inline uint32_t __device__ GetDtypeLogicalLen(column_device_view *col) +inline uint32_t __device__ GetDtypeLogicalLen(column_device_view* col) { switch (col->type().id()) { case cudf::type_id::INT8: @@ -291,18 +291,18 @@ struct EncPage; * @brief Struct describing an encoder column chunk */ struct EncColumnChunk { - parquet_column_device_view const *col_desc; //!< Column description - PageFragment *fragments; //!< First fragment in chunk - uint8_t *uncompressed_bfr; //!< Uncompressed page data - uint8_t *compressed_bfr; //!< Compressed page data - statistics_chunk const *stats; //!< Fragment statistics + parquet_column_device_view const* col_desc; //!< Column description + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics uint32_t bfr_size; //!< Uncompressed buffer size uint32_t compressed_size; //!< Compressed buffer size uint32_t start_row; //!< First row of chunk uint32_t num_rows; //!< Number of rows in chunk uint32_t num_values; //!< Number of values in chunk. 
Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk - EncPage *pages; //!< Ptr to pages that belong to this chunk + EncPage* pages; //!< Ptr to pages that belong to this chunk uint32_t first_page; //!< First page of chunk uint32_t num_pages; //!< Number of pages in chunk uint32_t dictionary_id; //!< Dictionary id for this chunk @@ -318,12 +318,12 @@ struct EncColumnChunk { * @brief Struct describing an encoder data page */ struct EncPage { - uint8_t *page_data; //!< Ptr to uncompressed page - uint8_t *compressed_data; //!< Ptr to compressed page + uint8_t* page_data; //!< Ptr to uncompressed page + uint8_t* compressed_data; //!< Ptr to compressed page uint16_t num_fragments; //!< Number of fragments in page PageType page_type; //!< Page type uint8_t dict_bits_plus1; //!< 0=plain, nonzero:bits to encoding dictionary indices + 1 - EncColumnChunk *chunk; //!< Chunk that this page belongs to + EncColumnChunk* chunk; //!< Chunk that this page belongs to uint32_t chunk_id; //!< Index in chunk array uint32_t hdr_size; //!< Size of page header uint32_t max_hdr_size; //!< Maximum size of page header @@ -333,7 +333,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. Includes null/empty elements in //!< non-leaf levels - gpu_inflate_status_s *comp_stat; //!< Ptr to compression status + gpu_inflate_status_s* comp_stat; //!< Ptr to compression status }; /** @@ -343,7 +343,7 @@ struct EncPage { * @param[in] num_chunks Number of column chunks * @param[in] stream CUDA stream to use, default 0 */ -void DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column @@ -353,7 +353,7 @@ void DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, rmm::cuda_st * @param[in] num_chunks Number of column chunks * @param[in] stream CUDA stream to use, default 0 */ -void BuildStringDictionaryIndex(ColumnChunkDesc *chunks, +void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); @@ -376,14 +376,14 @@ void BuildStringDictionaryIndex(ColumnChunkDesc *chunks, * @param[in] min_rows crop all rows below min_row * @param[in] stream Cuda stream */ -void PreprocessColumnData(hostdevice_vector &pages, - hostdevice_vector const &chunks, - std::vector &input_columns, - std::vector &output_columns, +void PreprocessColumnData(hostdevice_vector& pages, + hostdevice_vector const& chunks, + std::vector& input_columns, + std::vector& output_columns, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Launches kernel for reading the column data stored in the pages @@ -397,8 +397,8 @@ void PreprocessColumnData(hostdevice_vector &pages, * @param[in] min_row Minimum number of rows to read * @param[in] stream CUDA stream to use, default 0 */ -void DecodePageData(hostdevice_vector &pages, - hostdevice_vector const &chunks, +void DecodePageData(hostdevice_vector& pages, + hostdevice_vector const& chunks, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream); @@ -436,8 +436,8 @@ struct dremel_data { * @return A struct containing dremel data */ dremel_data get_dremel_data(column_view h_col, - 
rmm::device_uvector const &d_nullability, - std::vector const &nullability, + rmm::device_uvector const& d_nullability, + std::vector const& nullability, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -486,8 +486,8 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - statistics_merge_group *page_grstats = nullptr, - statistics_merge_group *chunk_grstats = nullptr, + statistics_merge_group* page_grstats = nullptr, + statistics_merge_group* chunk_grstats = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -524,7 +524,7 @@ void DecideCompression(device_span chunks, void EncodePageHeaders(device_span pages, device_span comp_out = {}, device_span page_stats = {}, - const statistics_chunk *chunk_stats = nullptr, + const statistics_chunk* chunk_stats = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -546,7 +546,7 @@ void GatherPages(device_span chunks, * @param[in] stream CUDA stream to use, default 0 */ void BuildChunkDictionaries(device_span chunks, - uint32_t *dev_scratch, + uint32_t* dev_scratch, rmm::cuda_stream_view stream); } // namespace gpu diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 0863bca7b03..3bf11063035 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -51,7 +51,7 @@ constexpr uint32_t PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED = (1 << 24); namespace { -parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const &logical) +parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const& logical) { if (logical.isset.STRING) { return parquet::UTF8; @@ -96,7 +96,7 @@ parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const /** * @brief Function that translates Parquet datatype to cuDF type enum */ -type_id to_type_id(SchemaElement const &schema, +type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id, bool strict_decimal_types) @@ -232,7 +232,7 @@ std::tuple conversion_info(type_id column_type_id, } // namespace -std::string name_from_path(const std::vector &path_in_schema) +std::string name_from_path(const std::vector& path_in_schema) { // For the case of lists, we will see a schema that looks like: // a.list.element.list.element @@ -273,16 +273,16 @@ std::string name_from_path(const std::vector &path_in_schema) * @brief Class for parsing dataset metadata */ struct metadata : public FileMetaData { - explicit metadata(datasource *source) + explicit metadata(datasource* source) { constexpr auto header_len = sizeof(file_header_s); constexpr auto ender_len = sizeof(file_ender_s); const auto len = source->size(); const auto header_buffer = source->host_read(0, header_len); - const auto header = reinterpret_cast(header_buffer->data()); + const auto header = reinterpret_cast(header_buffer->data()); const auto ender_buffer = source->host_read(len - ender_len, ender_len); - const auto ender = reinterpret_cast(ender_buffer->data()); + const auto ender = reinterpret_cast(ender_buffer->data()); CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, "Corrupted header or footer"); @@ -304,11 +304,11 @@ class aggregate_metadata { /** * @brief Create a metadata object from each element in the source vector */ - auto metadatas_from_sources(std::vector> const &sources) + auto 
metadatas_from_sources(std::vector> const& sources) { std::vector metadatas; std::transform( - sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const &source) { + sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { return metadata(source.get()); }); return metadatas; @@ -321,8 +321,10 @@ class aggregate_metadata { { std::map merged; // merge key/value maps TODO: warn/throw if there are mismatches? - for (auto const &pfm : per_file_metadata) { - for (auto const &kv : pfm.key_value_metadata) { merged[kv.key] = kv.value; } + for (auto const& pfm : per_file_metadata) { + for (auto const& kv : pfm.key_value_metadata) { + merged[kv.key] = kv.value; + } } return merged; } @@ -333,7 +335,7 @@ class aggregate_metadata { size_type calc_num_rows() const { return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) { + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { return sum + pfm.num_rows; }); } @@ -344,13 +346,13 @@ class aggregate_metadata { size_type calc_num_row_groups() const { return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) { + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { return sum + pfm.row_groups.size(); }); } public: - aggregate_metadata(std::vector> const &sources) + aggregate_metadata(std::vector> const& sources) : per_file_metadata(metadatas_from_sources(sources)), agg_keyval_map(merge_keyval_metadata()), num_rows(calc_num_rows()), @@ -358,7 +360,7 @@ class aggregate_metadata { { // Verify that the input files have matching numbers of columns size_type num_cols = -1; - for (auto const &pfm : per_file_metadata) { + for (auto const& pfm : per_file_metadata) { if (pfm.row_groups.size() != 0) { if (num_cols == -1) num_cols = pfm.row_groups[0].columns.size(); @@ -368,27 +370,27 @@ class aggregate_metadata { } } // Verify that the input files have matching schemas - for (auto const &pfm : per_file_metadata) { + for (auto const& pfm : per_file_metadata) { CUDF_EXPECTS(per_file_metadata[0].schema == pfm.schema, "All sources must have the same schemas"); } } - auto const &get_row_group(size_type row_group_index, size_type src_idx) const + auto const& get_row_group(size_type row_group_index, size_type src_idx) const { CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), "invalid source index"); return per_file_metadata[src_idx].row_groups[row_group_index]; } - auto const &get_column_metadata(size_type row_group_index, + auto const& get_column_metadata(size_type row_group_index, size_type src_idx, int schema_idx) const { auto col = std::find_if( per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), - [schema_idx](ColumnChunk const &col) { return col.schema_idx == schema_idx ? true : false; }); + [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? 
true : false; }); CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), "Found no metadata for schema index"); return col->meta_data; @@ -398,9 +400,9 @@ class aggregate_metadata { auto get_num_row_groups() const { return num_row_groups; } - auto const &get_schema(int schema_idx) const { return per_file_metadata[0].schema[schema_idx]; } + auto const& get_schema(int schema_idx) const { return per_file_metadata[0].schema[schema_idx]; } - auto const &get_key_value_metadata() const { return agg_keyval_map; } + auto const& get_key_value_metadata() const { return agg_keyval_map; } /** * @brief Gets the concrete nesting depth of output cudf columns @@ -411,7 +413,7 @@ class aggregate_metadata { */ inline int get_output_nesting_depth(int schema_index) const { - auto &pfm = per_file_metadata[0]; + auto& pfm = per_file_metadata[0]; int depth = 0; // walk upwards, skipping repeated fields @@ -462,7 +464,7 @@ class aggregate_metadata { * * @param names List of column names to load, where index column name(s) will be added */ - void add_pandas_index_names(std::vector &names) const + void add_pandas_index_names(std::vector& names) const { auto str = get_pandas_index(); if (str.length() != 0) { @@ -499,9 +501,9 @@ class aggregate_metadata { * * @return List of row group indexes and its starting row */ - auto select_row_groups(std::vector> const &row_groups, - size_type &row_start, - size_type &row_count) const + auto select_row_groups(std::vector> const& row_groups, + size_type& row_start, + size_type& row_count) const { if (!row_groups.empty()) { std::vector selection; @@ -510,7 +512,7 @@ class aggregate_metadata { row_count = 0; for (size_t src_idx = 0; src_idx < row_groups.size(); ++src_idx) { - for (auto const &rowgroup_idx : row_groups[src_idx]) { + for (auto const& rowgroup_idx : row_groups[src_idx]) { CUDF_EXPECTS( rowgroup_idx >= 0 && rowgroup_idx < static_cast(per_file_metadata[src_idx].row_groups.size()), @@ -561,16 +563,16 @@ class aggregate_metadata { * @param[in] strict_decimal_types True if it is an error to load an unsupported decimal type * */ - void build_column_info(int &schema_idx, - std::vector &input_columns, - std::vector &output_columns, - std::deque &nesting, + void build_column_info(int& schema_idx, + std::vector& input_columns, + std::vector& output_columns, + std::deque& nesting, bool strings_to_categorical, type_id timestamp_type_id, bool strict_decimal_types) const { int start_schema_idx = schema_idx; - auto const &schema = get_schema(schema_idx); + auto const& schema = get_schema(schema_idx); schema_idx++; // if I am a stub, continue on @@ -595,7 +597,7 @@ class aggregate_metadata { ? data_type{col_type, numeric::scale_type{-schema.decimal_scale}} : data_type{col_type}; output_columns.emplace_back(dtype, schema.repetition_type == OPTIONAL ? true : false); - column_buffer &output_col = output_columns.back(); + column_buffer& output_col = output_columns.back(); output_col.name = schema.name; // build each child @@ -613,7 +615,7 @@ class aggregate_metadata { // data stored) so add me to the list. 
if (schema.num_children == 0) { input_columns.emplace_back(input_column_info{start_schema_idx, schema.name}); - input_column_info &input_col = input_columns.back(); + input_column_info& input_col = input_columns.back(); std::copy(nesting.begin(), nesting.end(), std::back_inserter(input_col.nesting)); } @@ -631,13 +633,13 @@ class aggregate_metadata { * @return input column information, output column information, list of output column schema * indices */ - auto select_columns(std::vector const &use_names, + auto select_columns(std::vector const& use_names, bool include_index, bool strings_to_categorical, type_id timestamp_type_id, bool strict_decimal_types) const { - auto const &pfm = per_file_metadata[0]; + auto const& pfm = per_file_metadata[0]; // determine the list of output columns // @@ -659,16 +661,16 @@ class aggregate_metadata { if (use_names.empty()) { // walk the schema and choose all top level columns for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const &schema = pfm.schema[schema_idx]; + auto const& schema = pfm.schema[schema_idx]; if (schema.parent_idx == 0) { output_column_schemas.push_back(schema_idx); } } } else { // Load subset of columns; include PANDAS index unless excluded std::vector local_use_names = use_names; if (include_index) { add_pandas_index_names(local_use_names); } - for (const auto &use_name : local_use_names) { + for (const auto& use_name : local_use_names) { for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const &schema = pfm.schema[schema_idx]; + auto const& schema = pfm.schema[schema_idx]; // We select only top level columns by name. Selecting nested columns by name is not // supported. Top level columns are identified by their parent being the root (idx == 0) if (use_name == schema.name and schema.parent_idx == 0) { @@ -711,9 +713,9 @@ class aggregate_metadata { * @param src_col_schema The column schema to generate the new mapping for * @param md File metadata information */ -void generate_depth_remappings(std::map, std::vector>> &remap, +void generate_depth_remappings(std::map, std::vector>>& remap, int src_col_schema, - aggregate_metadata const &md) + aggregate_metadata const& md) { // already generated for this level if (remap.find(src_col_schema) != remap.end()) { return; } @@ -724,11 +726,11 @@ void generate_depth_remappings(std::map, std::ve "Attempting to remap a schema more than once"); auto inserted = remap.insert(std::pair, std::vector>>{src_col_schema, {}}); - auto &depth_remap = inserted.first->second; + auto& depth_remap = inserted.first->second; - std::vector &rep_depth_remap = (depth_remap.first); + std::vector& rep_depth_remap = (depth_remap.first); rep_depth_remap.resize(schema.max_repetition_level + 1); - std::vector &def_depth_remap = (depth_remap.second); + std::vector& def_depth_remap = (depth_remap.second); def_depth_remap.resize(schema.max_definition_level + 1); // the key: @@ -822,12 +824,12 @@ void generate_depth_remappings(std::map, std::ve * @copydoc cudf::io::detail::parquet::read_column_chunks */ void reader::impl::read_column_chunks( - std::vector> &page_data, - hostdevice_vector &chunks, // TODO const? + std::vector>& page_data, + hostdevice_vector& chunks, // TODO const? 
size_t begin_chunk, size_t end_chunk, - const std::vector &column_chunk_offsets, - std::vector const &chunk_source_map, + const std::vector& column_chunk_offsets, + std::vector const& chunk_source_map, rmm::cuda_stream_view stream) { // Transfer chunk data, coalescing adjacent chunks @@ -850,7 +852,7 @@ void reader::impl::read_column_chunks( next_chunk++; } if (io_size != 0) { - auto &source = _sources[chunk_source_map[chunk]]; + auto& source = _sources[chunk_source_map[chunk]]; if (source->is_device_read_preferred(io_size)) { page_data[chunk] = source->device_read(io_offset, io_size, stream); } else { @@ -872,7 +874,7 @@ void reader::impl::read_column_chunks( /** * @copydoc cudf::io::detail::parquet::count_page_headers */ -size_t reader::impl::count_page_headers(hostdevice_vector &chunks, +size_t reader::impl::count_page_headers(hostdevice_vector& chunks, rmm::cuda_stream_view stream) { size_t total_pages = 0; @@ -891,8 +893,8 @@ size_t reader::impl::count_page_headers(hostdevice_vector /** * @copydoc cudf::io::detail::parquet::decode_page_headers */ -void reader::impl::decode_page_headers(hostdevice_vector &chunks, - hostdevice_vector &pages, +void reader::impl::decode_page_headers(hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -912,15 +914,17 @@ void reader::impl::decode_page_headers(hostdevice_vector & * @copydoc cudf::io::detail::parquet::decompress_page_data */ rmm::device_buffer reader::impl::decompress_page_data( - hostdevice_vector &chunks, - hostdevice_vector &pages, + hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, const std::function &f) { + auto for_each_codec_page = [&](parquet::Compression codec, const std::function& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { - for (int k = 0; k < page_stride; k++) { f(page_count + k); } + for (int k = 0; k < page_stride; k++) { + f(page_count + k); + } } page_count += page_stride; } @@ -936,7 +940,7 @@ rmm::device_buffer reader::impl::decompress_page_data( std::make_pair(parquet::SNAPPY, 0), std::make_pair(parquet::BROTLI, 0)}; - for (auto &codec : codecs) { + for (auto& codec : codecs) { for_each_codec_page(codec.first, [&](size_t page) { total_decomp_size += pages[page].uncompressed_page_size; codec.second++; @@ -954,12 +958,12 @@ rmm::device_buffer reader::impl::decompress_page_data( size_t decomp_offset = 0; int32_t argc = 0; - for (const auto &codec : codecs) { + for (const auto& codec : codecs) { if (codec.second > 0) { int32_t start_pos = argc; for_each_codec_page(codec.first, [&](size_t page) { - auto dst_base = static_cast(decomp_pages.data()); + auto dst_base = static_cast(decomp_pages.data()); inflate_in[argc].srcDevice = pages[page].page_data; inflate_in[argc].srcSize = pages[page].compressed_page_size; inflate_in[argc].dstDevice = dst_base + decomp_offset; @@ -969,7 +973,7 @@ rmm::device_buffer reader::impl::decompress_page_data( inflate_out[argc].status = static_cast(-1000); inflate_out[argc].reserved = 0; - pages[page].page_data = static_cast(inflate_in[argc].dstDevice); + pages[page].page_data = static_cast(inflate_in[argc].dstDevice); decomp_offset += inflate_in[argc].dstSize; argc++; }); @@ -1027,17 +1031,17 @@ rmm::device_buffer reader::impl::decompress_page_data( /** * @copydoc 
cudf::io::detail::parquet::allocate_nesting_info */ -void reader::impl::allocate_nesting_info(hostdevice_vector const &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting_info, +void reader::impl::allocate_nesting_info(hostdevice_vector const& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting_info, rmm::cuda_stream_view stream) { // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation size_t const total_page_nesting_infos = std::accumulate( - chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto &chunk) { + chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { // the schema of the input column - auto const &schema = _metadata->get_schema(chunk.src_col_schema); + auto const& schema = _metadata->get_schema(chunk.src_col_schema); auto const per_page_nesting_info_size = max( schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); return total + (per_page_nesting_info_size * chunk.num_data_pages); @@ -1053,7 +1057,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector int src_info_index = 0; for (size_t idx = 0; idx < chunks.size(); idx++) { int src_col_schema = chunks[idx].src_col_schema; - auto &schema = _metadata->get_schema(src_col_schema); + auto& schema = _metadata->get_schema(src_col_schema); auto const per_page_nesting_info_size = std::max( schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); @@ -1078,7 +1082,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector int src_col_schema = chunks[idx].src_col_schema; // schema of the input column - auto &schema = _metadata->get_schema(src_col_schema); + auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) int max_depth = _metadata->get_output_nesting_depth(src_col_schema); @@ -1101,7 +1105,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo *pni = + gpu::PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -1109,8 +1113,8 @@ void reader::impl::allocate_nesting_info(hostdevice_vector auto remap = depth_remapping.find(src_col_schema); CUDF_EXPECTS(remap != depth_remapping.end(), "Could not find depth remapping for schema"); - std::vector const &rep_depth_remap = (remap->second.first); - std::vector const &def_depth_remap = (remap->second.second); + std::vector const& rep_depth_remap = (remap->second.first); + std::vector const& def_depth_remap = (remap->second.second); for (size_t m = 0; m < rep_depth_remap.size(); m++) { pni[m].start_depth = rep_depth_remap[m]; @@ -1145,8 +1149,8 @@ void reader::impl::allocate_nesting_info(hostdevice_vector /** * @copydoc cudf::io::detail::parquet::preprocess_columns */ -void reader::impl::preprocess_columns(hostdevice_vector &chunks, - hostdevice_vector &pages, +void reader::impl::preprocess_columns(hostdevice_vector& chunks, + hostdevice_vector& pages, size_t min_row, size_t total_rows, bool has_lists, @@ -1158,10 +1162,10 @@ void reader::impl::preprocess_columns(hostdevice_vector &c // if there are no lists, simply allocate every allocate every output // column to be of size num_rows if 
(!has_lists) { - std::function &)> create_columns = - [&](std::vector &cols) { + std::function&)> create_columns = + [&](std::vector& cols) { for (size_t idx = 0; idx < cols.size(); idx++) { - auto &col = cols[idx]; + auto& col = cols[idx]; col.create(total_rows, stream, _mr); create_columns(col.children); } @@ -1178,14 +1182,14 @@ void reader::impl::preprocess_columns(hostdevice_vector &c /** * @copydoc cudf::io::detail::parquet::decode_page_data */ -void reader::impl::decode_page_data(hostdevice_vector &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting, +void reader::impl::decode_page_data(hostdevice_vector& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting, size_t min_row, size_t total_rows, rmm::cuda_stream_view stream) { - auto is_dict_chunk = [](const gpu::ColumnChunkDesc &chunk) { + auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1207,20 +1211,20 @@ void reader::impl::decode_page_data(hostdevice_vector &chu std::accumulate(chunks.host_ptr(), chunks.host_ptr(chunks.size()), 0, - [&](size_t cursum, gpu::ColumnChunkDesc const &chunk) { + [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths); - auto chunk_nested_data = hostdevice_vector(sum_max_depths); + auto chunk_nested_valids = hostdevice_vector(sum_max_depths); + auto chunk_nested_data = hostdevice_vector(sum_max_depths); auto chunk_offsets = std::vector(); // Update chunks with pointers to column data. for (size_t c = 0, page_count = 0, str_ofs = 0, chunk_off = 0; c < chunks.size(); c++) { - input_column_info const &input_col = _input_columns[chunks[c].src_col_index]; + input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, "Column/page schema index mismatch"); @@ -1275,9 +1279,9 @@ void reader::impl::decode_page_data(hostdevice_vector &chu // // we do this by only handing out the pointers to the first child we come across. // - auto *cols = &_output_columns; + auto* cols = &_output_columns; for (size_t idx = 0; idx < max_depth; idx++) { - auto &out_buf = (*cols)[input_col.nesting[idx]]; + auto& out_buf = (*cols)[input_col.nesting[idx]]; cols = &out_buf.children; int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; @@ -1317,11 +1321,11 @@ void reader::impl::decode_page_data(hostdevice_vector &chu // last value that should then be followed by a terminator (because rows can span // page boundaries). 
for (size_t idx = 0; idx < _input_columns.size(); idx++) { - input_column_info const &input_col = _input_columns[idx]; + input_column_info const& input_col = _input_columns[idx]; - auto *cols = &_output_columns; + auto* cols = &_output_columns; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto &out_buf = (*cols)[input_col.nesting[l_idx]]; + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; if (out_buf.type.id() != type_id::LIST || @@ -1329,11 +1333,11 @@ void reader::impl::decode_page_data(hostdevice_vector &chu continue; } CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto &child = (*cols)[input_col.nesting[l_idx + 1]]; + auto& child = (*cols)[input_col.nesting[l_idx + 1]]; // the final offset for a list at level N is the size of its child int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - cudaMemcpyAsync(static_cast<int32_t *>(out_buf.data()) + (out_buf.size - 1), + cudaMemcpyAsync(static_cast<int32_t*>(out_buf.data()) + (out_buf.size - 1), &offset, sizeof(offset), cudaMemcpyHostToDevice, @@ -1344,17 +1348,17 @@ void reader::impl::decode_page_data(hostdevice_vector &chu // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo *pi = &pages[idx]; + gpu::PageInfo* pi = &pages[idx]; if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc *col = &chunks[pi->chunk_idx]; - input_column_info const &input_col = _input_columns[col->src_col_index]; + gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + input_column_info const& input_col = _input_columns[col->src_col_index]; int index = pi->nesting - page_nesting.device_ptr(); - gpu::PageNestingInfo *pni = &page_nesting[index]; + gpu::PageNestingInfo* pni = &page_nesting[index]; - auto *cols = &_output_columns; + auto* cols = &_output_columns; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto &out_buf = (*cols)[input_col.nesting[l_idx]]; + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; // if I wasn't the one who wrote out the validity bits, skip it @@ -1368,9 +1372,9 @@ void reader::impl::decode_page_data(hostdevice_vector &chu stream.synchronize(); } -reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources, - parquet_reader_options const &options, - rmm::mr::device_memory_resource *mr) +reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr) : _mr(mr), _sources(std::move(sources)) { // Open and parse the source dataset metadata @@ -1397,7 +1401,7 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, - std::vector<std::vector<size_type>> const &row_group_list, + std::vector<std::vector<size_type>> const& row_group_list, rmm::cuda_stream_view stream) { // Select only row groups required @@ -1431,8 +1435,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Initialize column chunk information size_t total_decompressed_size = 0; auto remaining_rows = num_rows; - for (const auto &rg : selected_row_groups) { - const auto &row_group = _metadata->get_row_group(rg.index, rg.source_index); + for (const auto& rg : selected_row_groups) { + const auto& row_group = _metadata->get_row_group(rg.index, rg.source_index); auto const row_group_start = rg.start_row; auto const row_group_source = rg.source_index; auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); @@ -1442,8 +1446,8 @@ table_with_metadata
reader::impl::read(size_type skip_rows, for (size_t i = 0; i < num_input_columns; ++i) { auto col = _input_columns[i]; // look up metadata - auto &col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto &schema = _metadata->get_schema(col.schema_idx); + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); // this column contains repetition levels and will require a preprocess if (schema.max_repetition_level > 0) { has_lists = true; } @@ -1579,7 +1583,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Return column names (must match order of returned columns) out_metadata.column_names.resize(_output_columns.size()); for (size_t i = 0; i < _output_column_schemas.size(); i++) { - auto const &schema = _metadata->get_schema(_output_column_schemas[i]); + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); out_metadata.column_names[i] = schema.name; } @@ -1590,19 +1594,19 @@ table_with_metadata reader::impl::read(size_type skip_rows, } // Forward to implementation -reader::reader(std::vector const &filepaths, - parquet_reader_options const &options, +reader::reader(std::vector const& filepaths, + parquet_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(datasource::create(filepaths), options, mr)) { } // Forward to implementation -reader::reader(std::vector> &&sources, - parquet_reader_options const &options, +reader::reader(std::vector>&& sources, + parquet_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sources), options, mr)) { } @@ -1611,7 +1615,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(parquet_reader_options const &options, +table_with_metadata reader::read(parquet_reader_options const& options, rmm::cuda_stream_view stream) { return _impl->read( diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index ffd8975a8d2..b93107aa9b2 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -60,9 +60,9 @@ class reader::impl { * @param options Settings for controlling reading behavior * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::vector> &&sources, - parquet_reader_options const &options, - rmm::mr::device_memory_resource *mr); + explicit impl(std::vector>&& sources, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data and returns a set of columns @@ -76,7 +76,7 @@ class reader::impl { */ table_with_metadata read(size_type skip_rows, size_type num_rows, - std::vector> const &row_group_indices, + std::vector> const& row_group_indices, rmm::cuda_stream_view stream); private: @@ -91,12 +91,12 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches. 
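The `reader::reader(...)` constructors above do nothing but forward into `reader::impl`, the pimpl idiom that keeps `reader_impl.hpp` out of the public headers and lets `reader::~reader() = default;` live next to the complete type. A generic sketch of that structure, with illustrative names (`parser` is not a cudf class):

```cpp
#include <memory>
#include <string>

// Public class: only declares the implementation type and owns it.
class parser {
 public:
  explicit parser(std::string source);
  ~parser();          // defined below, where impl is a complete type
  int parse() const;  // forwards to the implementation

 private:
  class impl;
  std::unique_ptr<impl> _impl;
};

// Private implementation: would normally live in the .cpp/.cu file.
class parser::impl {
 public:
  explicit impl(std::string source) : _source(std::move(source)) {}
  int parse() const { return static_cast<int>(_source.size()); }

 private:
  std::string _source;
};

parser::parser(std::string source) : _impl(std::make_unique<impl>(std::move(source))) {}
parser::~parser() = default;  // defaulted here, as reader::~reader() is above
int parser::parse() const { return _impl->parse(); }

int main() { return parser("abc").parse() == 3 ? 0 : 1; }
```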
* */ - void read_column_chunks(std::vector> &page_data, - hostdevice_vector &chunks, + void read_column_chunks(std::vector>& page_data, + hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, - const std::vector &column_chunk_offsets, - std::vector const &chunk_source_map, + const std::vector& column_chunk_offsets, + std::vector const& chunk_source_map, rmm::cuda_stream_view stream); /** @@ -107,7 +107,7 @@ class reader::impl { * * @return The total number of pages */ - size_t count_page_headers(hostdevice_vector &chunks, + size_t count_page_headers(hostdevice_vector& chunks, rmm::cuda_stream_view stream); /** @@ -117,8 +117,8 @@ class reader::impl { * @param pages List of page information * @param stream CUDA stream used for device memory operations and kernel launches. */ - void decode_page_headers(hostdevice_vector &chunks, - hostdevice_vector &pages, + void decode_page_headers(hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream); /** @@ -130,8 +130,8 @@ class reader::impl { * * @return Device buffer to decompressed page data */ - rmm::device_buffer decompress_page_data(hostdevice_vector &chunks, - hostdevice_vector &pages, + rmm::device_buffer decompress_page_data(hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream); /** @@ -149,9 +149,9 @@ class reader::impl { * @param page_nesting_info The allocated nesting info structs. * @param stream CUDA stream used for device memory operations and kernel launches. */ - void allocate_nesting_info(hostdevice_vector const &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting_info, + void allocate_nesting_info(hostdevice_vector const& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting_info, rmm::cuda_stream_view stream); /** @@ -172,8 +172,8 @@ class reader::impl { * a preprocess. * @param[in] stream Cuda stream */ - void preprocess_columns(hostdevice_vector &chunks, - hostdevice_vector &pages, + void preprocess_columns(hostdevice_vector& chunks, + hostdevice_vector& pages, size_t min_row, size_t total_rows, bool has_lists, @@ -189,15 +189,15 @@ class reader::impl { * @param total_rows Number of rows to output * @param stream CUDA stream used for device memory operations and kernel launches. */ - void decode_page_data(hostdevice_vector &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting, + void decode_page_data(hostdevice_vector& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting, size_t min_row, size_t total_rows, rmm::cuda_stream_view stream); private: - rmm::mr::device_memory_resource *_mr = nullptr; + rmm::mr::device_memory_resource* _mr = nullptr; std::vector> _sources; std::unique_ptr _metadata; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 77210b5a2ab..73924512bce 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -87,14 +87,14 @@ struct linked_column_view : public column_view { // copy of this object. Options: // 1. Inherit from column_view_base. Only lose out on children vector. That is not needed. // 2. Don't inherit at all. 
make linked_column_view keep a reference wrapper to its column_view - linked_column_view(column_view const &col) : column_view(col), parent(nullptr) + linked_column_view(column_view const& col) : column_view(col), parent(nullptr) { for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) { children.push_back(std::make_shared(this, *child_it)); } } - linked_column_view(linked_column_view *parent, column_view const &col) + linked_column_view(linked_column_view* parent, column_view const& col) : column_view(col), parent(parent) { for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) { @@ -102,7 +102,7 @@ struct linked_column_view : public column_view { } } - linked_column_view *parent; //!< Pointer to parent of this column. Nullptr if root + linked_column_view* parent; //!< Pointer to parent of this column. Nullptr if root LinkedColVector children; }; @@ -112,10 +112,10 @@ struct linked_column_view : public column_view { * @param table table of columns to convert * @return Vector of converted linked_column_views */ -LinkedColVector input_table_to_linked_columns(table_view const &table) +LinkedColVector input_table_to_linked_columns(table_view const& table) { LinkedColVector result; - for (column_view const &col : table) { + for (column_view const& col : table) { result.emplace_back(std::make_shared(col)); } @@ -144,9 +144,9 @@ struct schema_tree_node : public SchemaElement { }; struct leaf_schema_fn { - schema_tree_node &col_schema; - LinkedColPtr const &col; - column_in_metadata const &col_meta; + schema_tree_node& col_schema; + LinkedColPtr const& col; + column_in_metadata const& col_meta; bool timestamp_is_int96; template @@ -370,8 +370,8 @@ struct leaf_schema_fn { * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. * The resulting schema tree is stored in a vector in pre-order traversal order. 
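The `linked_column_view` hunk above wraps an existing tree so every node also knows its parent, with children held by `shared_ptr` so the parent pointers stay stable. A stripped-down sketch of the same linking pass over a toy tree (`node` and `linked_node` are illustrative, not cudf types):

```cpp
#include <memory>
#include <vector>

// A plain tree node, standing in for column_view and its children.
struct node {
  int id = 0;
  std::vector<node> children;
};

// Wrapper that mirrors the tree and records each node's parent.
struct linked_node {
  explicit linked_node(node const& n) : linked_node(nullptr, n) {}
  linked_node(linked_node* parent, node const& n) : id(n.id), parent(parent)
  {
    for (auto const& child : n.children) {
      children.push_back(std::make_shared<linked_node>(this, child));
    }
  }
  int id;
  linked_node* parent;  // nullptr at the root, as in the hunk above
  std::vector<std::shared_ptr<linked_node>> children;
};

int main()
{
  node root{1, {{2, {}}, {3, {}}}};
  linked_node linked(root);
  // Each child can now walk back up to its parent.
  return linked.children[0]->parent == &linked ? 0 : 1;
}
```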
*/ -std::vector construct_schema_tree(LinkedColVector const &linked_columns, - table_input_metadata const &metadata, +std::vector construct_schema_tree(LinkedColVector const& linked_columns, + table_input_metadata const& metadata, bool single_write_mode, bool int96_timestamps) { @@ -384,8 +384,8 @@ std::vector construct_schema_tree(LinkedColVector const &linke root.parent_idx = -1; // root schema has no parent schema.push_back(std::move(root)); - std::function add_schema = - [&](LinkedColPtr const &col, column_in_metadata const &col_meta, size_t parent_idx) { + std::function add_schema = + [&](LinkedColPtr const& col, column_in_metadata const& col_meta, size_t parent_idx) { bool col_nullable = [&]() { if (single_write_mode) { return col->nullable(); @@ -500,8 +500,8 @@ std::vector construct_schema_tree(LinkedColVector const &linke * */ struct parquet_column_view { - parquet_column_view(schema_tree_node const &schema_node, - std::vector const &schema_tree, + parquet_column_view(schema_tree_node const& schema_node, + std::vector const& schema_tree, rmm::cuda_stream_view stream); column_view leaf_column_view() const; @@ -510,7 +510,7 @@ struct parquet_column_view { column_view cudf_column_view() const { return cudf_col; } parquet::Type physical_type() const { return schema_node.type; } - std::vector const &get_path_in_schema() { return path_in_schema; } + std::vector const& get_path_in_schema() { return path_in_schema; } // LIST related member functions uint8_t max_def_level() const noexcept { return _max_def_level; } @@ -518,8 +518,8 @@ struct parquet_column_view { bool is_list() const noexcept { return _is_list; } // Dictionary related member functions - uint32_t *get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; } - uint32_t *get_dict_index() { return (_dict_index.size()) ? _dict_index.data() : nullptr; } + uint32_t* get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; } + uint32_t* get_dict_index() { return (_dict_index.size()) ? _dict_index.data() : nullptr; } void use_dictionary(bool use_dict) { _dictionary_used = use_dict; } void alloc_dictionary(size_t max_num_rows, rmm::cuda_stream_view stream) { @@ -563,8 +563,8 @@ struct parquet_column_view { rmm::device_uvector _dict_index; }; -parquet_column_view::parquet_column_view(schema_tree_node const &schema_node, - std::vector const &schema_tree, +parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, + std::vector const& schema_tree, rmm::cuda_stream_view stream) : schema_node(schema_node), _d_nullability(0, stream), @@ -578,7 +578,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const &schema_node, auto curr_col = schema_node.leaf_column.get(); column_view single_inheritance_cudf_col = *curr_col; while (curr_col->parent) { - auto const &parent = *curr_col->parent; + auto const& parent = *curr_col->parent; // For list columns, we still need to retain the offset child column. 
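As the doc comment above says, `construct_schema_tree` flattens the tree into a vector in pre-order, with each node remembering its parent's index; it also uses the same self-referencing `std::function` recursion as `create_columns` and `add_schema`. A compact sketch of that flattening under assumed toy types (`tree_node`, `flat_node`):

```cpp
#include <functional>
#include <vector>

struct tree_node {
  char name;
  std::vector<tree_node> children;
};

// Flat representation: pre-order position in the vector plus parent index.
struct flat_node {
  char name;
  int parent_idx;  // -1 for the root, matching the hunk above
};

int main()
{
  tree_node root{'a', {{'b', {}}, {'c', {{'d', {}}}}}};

  std::vector<flat_node> flat;
  // std::function lets the lambda call itself, as add_schema does above.
  std::function<void(tree_node const&, int)> add_node =
    [&](tree_node const& n, int parent_idx) {
      flat.push_back({n.name, parent_idx});
      int const my_idx = static_cast<int>(flat.size()) - 1;
      for (auto const& child : n.children) add_node(child, my_idx);
    };
  add_node(root, -1);

  // Pre-order gives a b c d, with d's parent being c at index 2.
  return (flat.size() == 4 && flat[3].parent_idx == 2) ? 0 : 1;
}
```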
auto children = @@ -718,7 +718,7 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s return desc; } -void writer::impl::init_page_fragments(cudf::detail::hostdevice_2dvector &frag, +void writer::impl::init_page_fragments(cudf::detail::hostdevice_2dvector& frag, device_span col_desc, uint32_t num_rows, uint32_t fragment_size) @@ -745,7 +745,7 @@ void writer::impl::gather_fragment_statistics( } void writer::impl::build_chunk_dictionaries( - hostdevice_2dvector &chunks, + hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, uint32_t num_dictionaries) @@ -762,11 +762,11 @@ void writer::impl::build_chunk_dictionaries( chunks.device_to_host(stream, true); } -void writer::impl::init_encoder_pages(hostdevice_2dvector &chunks, +void writer::impl::init_encoder_pages(hostdevice_2dvector& chunks, device_span col_desc, device_span pages, - statistics_chunk *page_stats, - statistics_chunk *frag_stats, + statistics_chunk* page_stats, + statistics_chunk* frag_stats, uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr) @@ -795,14 +795,14 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector & stream.synchronize(); } -void writer::impl::encode_pages(hostdevice_2dvector &chunks, +void writer::impl::encode_pages(hostdevice_2dvector& chunks, device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, uint32_t first_rowgroup, - const statistics_chunk *page_stats, - const statistics_chunk *chunk_stats) + const statistics_chunk* page_stats, + const statistics_chunk* chunk_stats) { auto batch_pages = pages.subspan(first_page_in_batch, pages_in_batch); @@ -844,10 +844,10 @@ void writer::impl::encode_pages(hostdevice_2dvector &chunks } writer::impl::impl(std::unique_ptr sink, - parquet_writer_options const &options, + parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), compression_(to_parquet_compression(options.get_compression())), @@ -863,10 +863,10 @@ writer::impl::impl(std::unique_ptr sink, } writer::impl::impl(std::unique_ptr sink, - chunked_parquet_writer_options const &options, + chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), compression_(to_parquet_compression(options.get_compression())), @@ -892,7 +892,7 @@ void writer::impl::init_state() current_chunk_offset = sizeof(file_header_s); } -void writer::impl::write(table_view const &table) +void writer::impl::write(table_view const& table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); @@ -901,8 +901,8 @@ void writer::impl::write(table_view const &table) if (not table_meta) { table_meta = std::make_unique(table); } // Fill unnamed columns' names in table_meta - std::function add_default_name = - [&](column_in_metadata &col_meta, std::string default_name) { + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { if (col_meta.get_name().empty()) col_meta.set_name(default_name); for (size_type i = 0; i < col_meta.num_children(); ++i) { add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); @@ -917,14 +917,16 @@ void writer::impl::write(table_view const &table) // Construct parquet_column_views from the schema tree leaf nodes. 
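Steps like `init_page_fragments` and `build_chunk_dictionaries` above follow one round-trip pattern: fill descriptor structs on the host, copy them to the device, let a kernel update them, then `device_to_host` the results. A self-contained CUDA sketch of that round trip, with illustrative names (`frag_desc`, `count_fragments` are not cudf symbols):

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// POD descriptor filled on the host; num_fragments is computed on device.
struct frag_desc {
  int num_rows;
  int fragment_size;
  int num_fragments;
};

__global__ void count_fragments(frag_desc* descs, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    descs[i].num_fragments =
      (descs[i].num_rows + descs[i].fragment_size - 1) / descs[i].fragment_size;
  }
}

int main()
{
  std::vector<frag_desc> h_descs{{5000, 1024, 0}, {100, 1024, 0}};
  frag_desc* d_descs  = nullptr;
  size_t const bytes  = h_descs.size() * sizeof(frag_desc);

  cudaMalloc(&d_descs, bytes);
  cudaMemcpy(d_descs, h_descs.data(), bytes, cudaMemcpyHostToDevice);
  count_fragments<<<1, 32>>>(d_descs, static_cast<int>(h_descs.size()));
  cudaMemcpy(h_descs.data(), d_descs, bytes, cudaMemcpyDeviceToHost);
  cudaFree(d_descs);

  std::printf("%d %d\n", h_descs[0].num_fragments, h_descs[1].num_fragments);  // 5 1
  return 0;
}
```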
std::vector parquet_columns; - for (schema_tree_node const &schema_node : schema_tree) { + for (schema_tree_node const& schema_node : schema_tree) { if (schema_node.leaf_column) { parquet_columns.emplace_back(schema_node, schema_tree, stream); } } // Mass allocation of column_device_views for each parquet_column_view std::vector cudf_cols; cudf_cols.reserve(parquet_columns.size()); - for (auto const &parq_col : parquet_columns) { cudf_cols.push_back(parq_col.cudf_column_view()); } + for (auto const& parq_col : parquet_columns) { + cudf_cols.push_back(parq_col.cudf_column_view()); + } table_view single_streams_table(cudf_cols); size_type num_columns = single_streams_table.num_columns(); @@ -938,7 +940,7 @@ void writer::impl::write(table_view const &table) std::transform(table_meta->user_data.begin(), table_meta->user_data.end(), std::back_inserter(md.key_value_metadata), - [](auto const &kv) { + [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); md.schema = this_table_schema; @@ -960,7 +962,7 @@ void writer::impl::write(table_view const &table) // This should've been `auto const&` but isn't since dictionary space is allocated when calling // get_device_view(). Fix during dictionary refactor. std::transform( - parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto &pcol) { + parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto& pcol) { return pcol.get_device_view(stream); }); @@ -1039,7 +1041,7 @@ void writer::impl::write(table_view const &table) md.row_groups[global_r].total_byte_size = 0; md.row_groups[global_r].columns.resize(num_columns); for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; + gpu::EncColumnChunk* ck = &chunks[r][i]; bool dict_enable = false; *ck = {}; @@ -1088,7 +1090,9 @@ void writer::impl::write(table_view const &table) } // Free unused dictionaries - for (auto &col : parquet_columns) { col.check_dictionary_used(stream); } + for (auto& col : parquet_columns) { + col.check_dictionary_used(stream); + } // Build chunk dictionaries and count pages if (num_chunks != 0) { @@ -1107,7 +1111,7 @@ void writer::impl::write(table_view const &table) size_t rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; + gpu::EncColumnChunk* ck = &chunks[r][i]; ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; @@ -1146,11 +1150,11 @@ void writer::impl::write(table_view const &table) // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
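The batched writeback a couple of hunks below stages each compressed chunk through pinned host memory allocated once with `cudaMallocHost` and released by a `unique_ptr` custom deleter. A minimal sketch of that ownership pattern (`make_pinned` is an illustrative helper, not a cudf function):

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include <memory>

// unique_ptr that owns page-locked (pinned) host memory and releases it
// with cudaFreeHost, mirroring the pinned_buffer usage in the next hunk.
using pinned_buffer = std::unique_ptr<uint8_t, decltype(&cudaFreeHost)>;

pinned_buffer make_pinned(size_t size)
{
  uint8_t* ptr = nullptr;
  cudaMallocHost(&ptr, size);  // production code should check the status
  return pinned_buffer{ptr, cudaFreeHost};
}

int main()
{
  auto buf = make_pinned(1 << 20);  // 1 MiB of pinned staging memory
  buf.get()[0] = 42;                // usable directly from the host
  return buf.get()[0] == 42 ? 0 : 1;  // cudaFreeHost runs automatically
}
```

Pinned memory is what makes the device-to-host copies of each chunk truly asynchronous, which is why the buffer is sized once for the largest chunk and reused.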
rmm::device_uvector page_stats(num_stats_bfr, stream); for (uint32_t b = 0, r = 0; b < (uint32_t)batch_list.size(); b++) { - uint8_t *bfr = static_cast(uncomp_bfr.data()); - uint8_t *bfr_c = static_cast(comp_bfr.data()); + uint8_t* bfr = static_cast(uncomp_bfr.data()); + uint8_t* bfr_c = static_cast(comp_bfr.data()); for (uint32_t j = 0; j < batch_list[b]; j++, r++) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; + gpu::EncColumnChunk* ck = &chunks[r][i]; ck->uncompressed_bfr = bfr; ck->compressed_bfr = bfr_c; bfr += ck->bfr_size; @@ -1194,8 +1198,8 @@ void writer::impl::write(table_view const &table) : nullptr); for (; r < rnext; r++, global_r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; - uint8_t *dev_bfr; + gpu::EncColumnChunk* ck = &chunks[r][i]; + uint8_t* dev_bfr; if (ck->is_compressed) { md.row_groups[global_r].columns[i].meta_data.codec = compression_; dev_bfr = ck->compressed_bfr; @@ -1220,7 +1224,7 @@ void writer::impl::write(table_view const &table) } else { if (!host_bfr) { host_bfr = pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; + uint8_t* ptr = nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); return ptr; }(max_chunk_bfr_size), @@ -1255,7 +1259,7 @@ void writer::impl::write(table_view const &table) } std::unique_ptr> writer::impl::close( - std::string const &column_chunks_file_path) + std::string const& column_chunks_file_path) { if (closed) { return nullptr; } closed = true; @@ -1273,15 +1277,17 @@ std::unique_ptr> writer::impl::close( file_header_s fhdr = {parquet_magic}; buffer_.resize(0); buffer_.insert(buffer_.end(), - reinterpret_cast(&fhdr), - reinterpret_cast(&fhdr) + sizeof(fhdr)); - for (auto &rowgroup : md.row_groups) { - for (auto &col : rowgroup.columns) { col.file_path = column_chunks_file_path; } + reinterpret_cast(&fhdr), + reinterpret_cast(&fhdr) + sizeof(fhdr)); + for (auto& rowgroup : md.row_groups) { + for (auto& col : rowgroup.columns) { + col.file_path = column_chunks_file_path; + } } fendr.footer_len = static_cast(cpw.write(md)); buffer_.insert(buffer_.end(), - reinterpret_cast(&fendr), - reinterpret_cast(&fendr) + sizeof(fendr)); + reinterpret_cast(&fendr), + reinterpret_cast(&fendr) + sizeof(fendr)); return std::make_unique>(std::move(buffer_)); } else { return {nullptr}; @@ -1290,19 +1296,19 @@ std::unique_ptr> writer::impl::close( // Forward to implementation writer::writer(std::unique_ptr sink, - parquet_writer_options const &options, + parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } writer::writer(std::unique_ptr sink, - chunked_parquet_writer_options const &options, + chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } @@ -1311,23 +1317,23 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const &table) { _impl->write(table); } +void writer::write(table_view const& table) { _impl->write(table); } // Forward to implementation -std::unique_ptr> writer::close(std::string const &column_chunks_file_path) +std::unique_ptr> writer::close(std::string const& column_chunks_file_path) { return 
_impl->close(column_chunks_file_path); } std::unique_ptr> writer::merge_rowgroup_metadata( - const std::vector>> &metadata_list) + const std::vector>>& metadata_list) { std::vector output; CompactProtocolWriter cpw(&output); FileMetaData md; md.row_groups.reserve(metadata_list.size()); - for (const auto &blob : metadata_list) { + for (const auto& blob : metadata_list) { CompactProtocolReader cpreader( blob.get()->data(), std::max(blob.get()->size(), sizeof(file_ender_s)) - sizeof(file_ender_s)); @@ -1356,13 +1362,13 @@ std::unique_ptr> writer::merge_rowgroup_metadata( file_ender_s fendr; fhdr.magic = parquet_magic; output.insert(output.end(), - reinterpret_cast(&fhdr), - reinterpret_cast(&fhdr) + sizeof(fhdr)); + reinterpret_cast(&fhdr), + reinterpret_cast(&fhdr) + sizeof(fhdr)); fendr.footer_len = static_cast(cpw.write(md)); fendr.magic = parquet_magic; output.insert(output.end(), - reinterpret_cast(&fendr), - reinterpret_cast(&fendr) + sizeof(fendr)); + reinterpret_cast(&fendr), + reinterpret_cast(&fendr) + sizeof(fendr)); return std::make_unique>(std::move(output)); } diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index fd148724712..333f0e1aae7 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -60,21 +60,21 @@ using block_reduce_storage = detail::block_reduce_storage; */ template struct calculate_group_statistics_functor { - block_reduce_storage &temp_storage; + block_reduce_storage& temp_storage; /** * @brief Construct a statistics calculator * * @param d_temp_storage Temporary storage to be used by cub calls */ - __device__ calculate_group_statistics_functor(block_reduce_storage &d_temp_storage) + __device__ calculate_group_statistics_functor(block_reduce_storage& d_temp_storage) : temp_storage(d_temp_storage) { } template ::is_ignored> * = nullptr> - __device__ void operator()(stats_state_s &s, uint32_t t) + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(stats_state_s& s, uint32_t t) { // No-op for unsupported aggregation types } @@ -88,8 +88,8 @@ struct calculate_group_statistics_functor { * @param t thread id */ template ::is_ignored> * = nullptr> - __device__ void operator()(stats_state_s &s, uint32_t t) + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(stats_state_s& s, uint32_t t) { detail::storage_wrapper storage(temp_storage); @@ -123,17 +123,17 @@ struct calculate_group_statistics_functor { */ template struct merge_group_statistics_functor { - block_reduce_storage &temp_storage; + block_reduce_storage& temp_storage; - __device__ merge_group_statistics_functor(block_reduce_storage &d_temp_storage) + __device__ merge_group_statistics_functor(block_reduce_storage& d_temp_storage) : temp_storage(d_temp_storage) { } template ::is_ignored> * = nullptr> - __device__ void operator()(merge_state_s &s, - const statistics_chunk *chunks, + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(merge_state_s& s, + const statistics_chunk* chunks, const uint32_t num_chunks, uint32_t t) { @@ -141,9 +141,9 @@ struct merge_group_statistics_functor { } template ::is_ignored> * = nullptr> - __device__ void operator()(merge_state_s &s, - const statistics_chunk *chunks, + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(merge_state_s& s, + const statistics_chunk* chunks, const uint32_t num_chunks, uint32_t t) { @@ -151,7 +151,9 @@ struct merge_group_statistics_functor { 

typed_statistics_chunk::is_aggregated> chunk; - for (uint32_t i = t; i < num_chunks; i += block_size) { chunk.reduce(chunks[i]); } + for (uint32_t i = t; i < num_chunks; i += block_size) { + chunk.reduce(chunks[i]); + } chunk.has_minmax = (chunk.minimum_value <= chunk.maximum_value); chunk = block_reduce(chunk, storage); @@ -170,17 +172,16 @@ struct merge_group_statistics_functor { * @tparam T Type of object */ template -__device__ void cooperative_load(T &destination, const T *source = nullptr) +__device__ void cooperative_load(T& destination, const T* source = nullptr) { using load_type = std::conditional_t<((sizeof(T) % sizeof(uint32_t)) == 0), uint32_t, uint8_t>; if (source == nullptr) { for (auto i = threadIdx.x; i < (sizeof(T) / sizeof(load_type)); i += blockDim.x) { - reinterpret_cast(&destination)[i] = load_type{0}; + reinterpret_cast(&destination)[i] = load_type{0}; } } else { for (auto i = threadIdx.x; i < sizeof(T) / sizeof(load_type); i += blockDim.x) { - reinterpret_cast(&destination)[i] = - reinterpret_cast(source)[i]; + reinterpret_cast(&destination)[i] = reinterpret_cast(source)[i]; } } } @@ -195,7 +196,7 @@ __device__ void cooperative_load(T &destination, const T *source = nullptr) */ template __global__ void __launch_bounds__(block_size, 1) - gpu_calculate_group_statistics(statistics_chunk *chunks, const statistics_group *groups) + gpu_calculate_group_statistics(statistics_chunk* chunks, const statistics_group* groups) { __shared__ __align__(8) stats_state_s state; __shared__ block_reduce_storage storage; @@ -229,8 +230,8 @@ namespace detail { * @tparam IO File format for which statistics calculation is being done */ template -void calculate_group_statistics(statistics_chunk *chunks, - const statistics_group *groups, +void calculate_group_statistics(statistics_chunk* chunks, + const statistics_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream) { @@ -250,9 +251,9 @@ void calculate_group_statistics(statistics_chunk *chunks, */ template __global__ void __launch_bounds__(block_size, 1) - gpu_merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups) + gpu_merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups) { __shared__ __align__(8) merge_state_s state; __shared__ block_reduce_storage storage; @@ -284,9 +285,9 @@ __global__ void __launch_bounds__(block_size, 1) * @tparam IO File format for which statistics calculation is being done */ template -void merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups, +void merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/statistics/conversion_type_select.cuh b/cpp/src/io/statistics/conversion_type_select.cuh index 225377bfc4b..b76a5fcf3cd 100644 --- a/cpp/src/io/statistics/conversion_type_select.cuh +++ b/cpp/src/io/statistics/conversion_type_select.cuh @@ -70,7 +70,7 @@ template class Detect; /** - * @brief Utility class to detect multiple occurences of a type in the first element of pairs in a + * @brief Utility class to detect multiple occurrences of a type in the first element of pairs in a * tuple For eg. 
with the following tuple : * * using conversion_types = diff --git a/cpp/src/io/statistics/orc_column_statistics.cu b/cpp/src/io/statistics/orc_column_statistics.cu index ad8a05a56f5..9e0dc1c1b7d 100644 --- a/cpp/src/io/statistics/orc_column_statistics.cu +++ b/cpp/src/io/statistics/orc_column_statistics.cu @@ -26,14 +26,14 @@ namespace io { namespace detail { template <> -void merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups, +void merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); template <> -void calculate_group_statistics(statistics_chunk *chunks, - const statistics_group *groups, +void calculate_group_statistics(statistics_chunk* chunks, + const statistics_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/statistics/parquet_column_statistics.cu b/cpp/src/io/statistics/parquet_column_statistics.cu index ad067cd4aad..525065576de 100644 --- a/cpp/src/io/statistics/parquet_column_statistics.cu +++ b/cpp/src/io/statistics/parquet_column_statistics.cu @@ -26,14 +26,14 @@ namespace io { namespace detail { template <> -void merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups, +void merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); template <> -void calculate_group_statistics(statistics_chunk *chunks, - const statistics_group *groups, +void calculate_group_statistics(statistics_chunk* chunks, + const statistics_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index f7bf6e407c1..c60e4eebaa0 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -53,15 +53,15 @@ struct stats_column_desc { //!< nested columns int32_t ts_scale; //!< timestamp scale (>0: multiply by scale, <0: divide by -scale) - column_device_view *leaf_column; //!< Pointer to leaf column - column_device_view *parent_column; //!< Pointer to parent column. Is nullptr if not list type. + column_device_view* leaf_column; //!< Pointer to leaf column + column_device_view* parent_column; //!< Pointer to parent column. Is nullptr if not list type. 
}; struct string_stats { - const char *ptr; //!< ptr to character data + const char* ptr; //!< ptr to character data uint32_t length; //!< length of string - __host__ __device__ __forceinline__ volatile string_stats &operator=( - const string_view &val) volatile + __host__ __device__ __forceinline__ volatile string_stats& operator=( + const string_view& val) volatile { ptr = val.data(); length = val.size_bytes(); @@ -99,13 +99,13 @@ struct statistics_chunk { }; struct statistics_group { - const stats_column_desc *col; //!< Column information + const stats_column_desc* col; //!< Column information uint32_t start_row; //!< Start row of this group uint32_t num_rows; //!< Number of rows in group }; struct statistics_merge_group { - const stats_column_desc *col; //!< Column information + const stats_column_desc* col; //!< Column information uint32_t start_chunk; //!< Start chunk of this group uint32_t num_chunks; //!< Number of chunks in group }; diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 84399a307a5..869e2833285 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -55,8 +55,8 @@ struct conversion_map { std::pair>; }; -// In Parquet timestamps and durations with second resoluion are converted to -// milliseconds. Timestamps and durations with nanosecond resoluion are +// In Parquet timestamps and durations with second resolution are converted to +// milliseconds. Timestamps and durations with nanosecond resolution are // converted to microseconds. template <> struct conversion_map { diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 759aa2517b6..2b4f69df10f 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -124,18 +124,18 @@ inline __device__ double Int128ToDouble_rn(uint64_t lo, int64_t hi) return sign * __fma_rn(__ll2double_rn(hi), 4294967296.0 * 4294967296.0, __ull2double_rn(lo)); } -inline __device__ uint32_t unaligned_load32(const uint8_t *p) +inline __device__ uint32_t unaligned_load32(const uint8_t* p) { uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t *p32 = reinterpret_cast(p - ofs); + const uint32_t* p32 = reinterpret_cast(p - ofs); uint32_t v = p32[0]; return (ofs) ? 
__funnelshift_r(v, p32[1], ofs * 8) : v; } -inline __device__ uint64_t unaligned_load64(const uint8_t *p) +inline __device__ uint64_t unaligned_load64(const uint8_t* p) { uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t *p32 = reinterpret_cast(p - ofs); + const uint32_t* p32 = reinterpret_cast(p - ofs); uint32_t v0 = p32[0]; uint32_t v1 = p32[1]; if (ofs) { @@ -146,10 +146,10 @@ inline __device__ uint64_t unaligned_load64(const uint8_t *p) } template -inline __device__ void memcpy_block(void *dstv, const void *srcv, uint32_t len, uint32_t t) +inline __device__ void memcpy_block(void* dstv, const void* srcv, uint32_t len, uint32_t t) { - uint8_t *dst = static_cast(dstv); - const uint8_t *src = static_cast(srcv); + uint8_t* dst = static_cast(dstv); + const uint8_t* src = static_cast(srcv); uint32_t dst_align_bytes, src_align_bytes, src_align_bits; // Align output to 32-bit dst_align_bytes = 3 & -reinterpret_cast(dst); @@ -166,7 +166,7 @@ inline __device__ void memcpy_block(void *dstv, const void *srcv, uint32_t len, src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes * 8; while (len >= 4) { - const uint32_t *src32 = reinterpret_cast(src - src_align_bytes); + const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); uint32_t copy_cnt = min(len >> 2, nthreads); uint32_t v; if (t < copy_cnt) { @@ -174,7 +174,7 @@ inline __device__ void memcpy_block(void *dstv, const void *srcv, uint32_t len, if (src_align_bits != 0) { v = __funnelshift_r(v, src32[t + 1], src_align_bits); } } if (sync_before_store) { __syncthreads(); } - if (t < copy_cnt) { reinterpret_cast(dst)[t] = v; } + if (t < copy_cnt) { reinterpret_cast(dst)[t] = v; } src += copy_cnt * 4; dst += copy_cnt * 4; len -= copy_cnt * 4; diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh index c08f42583ef..03ea041706a 100644 --- a/cpp/src/io/utilities/column_utils.cuh +++ b/cpp/src/io/utilities/column_utils.cuh @@ -49,7 +49,7 @@ namespace io { template rmm::device_uvector create_leaf_column_device_views( typename cudf::device_span col_desc, - const table_device_view &parent_table_device_view, + const table_device_view& parent_table_device_view, rmm::cuda_stream_view stream) { rmm::device_uvector leaf_column_views(parent_table_device_view.num_columns(), @@ -71,7 +71,7 @@ rmm::device_uvector create_leaf_column_device_views( : col.child(0); } // Store leaf_column to device storage - column_device_view *leaf_col_ptr = leaf_columns.begin() + index; + column_device_view* leaf_col_ptr = leaf_columns.begin() + index; *leaf_col_ptr = col; col_desc[index].leaf_column = leaf_col_ptr; }); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ac8deccd078..4b23d008344 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -32,7 +32,7 @@ namespace { */ class file_source : public datasource { public: - explicit file_source(const char *filepath) + explicit file_source(const char* filepath) : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath)) { } @@ -58,7 +58,7 @@ class file_source : public datasource { size_t device_read(size_t offset, size_t size, - uint8_t *dst, + uint8_t* dst, rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); @@ -84,7 +84,7 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(const char 
*filepath, size_t offset, size_t size) + explicit memory_mapped_source(const char* filepath, size_t offset, size_t size) : file_source(filepath) { if (_file.size() != 0) map(_file.desc(), offset, size); @@ -103,17 +103,17 @@ class memory_mapped_source : public file_source { auto const read_size = std::min(size, _map_size - (offset - _map_offset)); return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), read_size); + static_cast(_map_addr) + (offset - _map_offset), read_size); } - size_t host_read(size_t offset, size_t size, uint8_t *dst) override + size_t host_read(size_t offset, size_t size, uint8_t* dst) override { CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - auto const src = static_cast(_map_addr) + (offset - _map_offset); + auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); return read_size; } @@ -139,7 +139,7 @@ class memory_mapped_source : public file_source { private: size_t _map_size = 0; size_t _map_offset = 0; - void *_map_addr = nullptr; + void* _map_addr = nullptr; }; /** @@ -150,7 +150,7 @@ class memory_mapped_source : public file_source { */ class direct_read_source : public file_source { public: - explicit direct_read_source(const char *filepath) : file_source(filepath) {} + explicit direct_read_source(const char* filepath) : file_source(filepath) {} std::unique_ptr host_read(size_t offset, size_t size) override { @@ -164,7 +164,7 @@ class direct_read_source : public file_source { return buffer::create(std::move(v)); } - size_t host_read(size_t offset, size_t size, uint8_t *dst) override + size_t host_read(size_t offset, size_t size, uint8_t* dst) override { lseek(_file.desc(), offset, SEEK_SET); @@ -186,9 +186,9 @@ class direct_read_source : public file_source { */ class user_datasource_wrapper : public datasource { public: - explicit user_datasource_wrapper(datasource *const source) : source(source) {} + explicit user_datasource_wrapper(datasource* const source) : source(source) {} - size_t host_read(size_t offset, size_t size, uint8_t *dst) override + size_t host_read(size_t offset, size_t size, uint8_t* dst) override { return source->host_read(offset, size, dst); } @@ -202,7 +202,7 @@ class user_datasource_wrapper : public datasource { size_t device_read(size_t offset, size_t size, - uint8_t *dst, + uint8_t* dst, rmm::cuda_stream_view stream) override { return source->device_read(offset, size, dst, stream); @@ -218,12 +218,12 @@ class user_datasource_wrapper : public datasource { size_t size() const override { return source->size(); } private: - datasource *const source; ///< A non-owning pointer to the user-implemented datasource + datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; } // namespace -std::unique_ptr datasource::create(const std::string &filepath, +std::unique_ptr datasource::create(const std::string& filepath, size_t offset, size_t size) { @@ -237,14 +237,14 @@ std::unique_ptr datasource::create(const std::string &filepath, return std::make_unique(filepath.c_str(), offset, size); } -std::unique_ptr datasource::create(host_buffer const &buffer) +std::unique_ptr datasource::create(host_buffer const& buffer) { // Use Arrow IO buffer class for zero-copy reads of host memory return std::make_unique(std::make_shared( - reinterpret_cast(buffer.data), buffer.size)); + reinterpret_cast(buffer.data), 
buffer.size)); } -std::unique_ptr datasource::create(datasource *source) +std::unique_ptr datasource::create(datasource* source) { // instantiate a wrapper that forwards the calls to the user implementation return std::make_unique(source); diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index abf3a3fdef0..b5fb9fb51bc 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -32,13 +32,13 @@ size_t get_file_size(int file_descriptor) return static_cast(st.st_size); } -file_wrapper::file_wrapper(std::string const &filepath, int flags) +file_wrapper::file_wrapper(std::string const& filepath, int flags) : fd(open(filepath.c_str(), flags)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } -file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) +file_wrapper::file_wrapper(std::string const& filepath, int flags, mode_t mode) : fd(open(filepath.c_str(), flags, mode)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); @@ -46,7 +46,7 @@ file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) file_wrapper::~file_wrapper() { close(fd); } -std::string getenv_or(std::string const &env_var_name, std::string const &default_val) +std::string getenv_or(std::string const& env_var_name, std::string const& default_val) { auto const env_val = std::getenv(env_var_name.c_str()); return (env_val == nullptr) ? default_val : std::string(env_val); @@ -81,7 +81,7 @@ cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", defau } } } -cufile_config const *cufile_config::instance() +cufile_config const* cufile_config::instance() { static cufile_config _instance; return &_instance; @@ -94,18 +94,18 @@ class cufile_shim { private: cufile_shim(); - void *cf_lib = nullptr; - decltype(cuFileDriverOpen) *driver_open = nullptr; - decltype(cuFileDriverClose) *driver_close = nullptr; + void* cf_lib = nullptr; + decltype(cuFileDriverOpen)* driver_open = nullptr; + decltype(cuFileDriverClose)* driver_close = nullptr; std::unique_ptr init_error; auto is_valid() const noexcept { return init_error == nullptr; } public: - cufile_shim(cufile_shim const &) = delete; - cufile_shim &operator=(cufile_shim const &) = delete; + cufile_shim(cufile_shim const&) = delete; + cufile_shim& operator=(cufile_shim const&) = delete; - static cufile_shim const *instance(); + static cufile_shim const* instance(); ~cufile_shim() { @@ -113,10 +113,10 @@ class cufile_shim { dlclose(cf_lib); } - decltype(cuFileHandleRegister) *handle_register = nullptr; - decltype(cuFileHandleDeregister) *handle_deregister = nullptr; - decltype(cuFileRead) *read = nullptr; - decltype(cuFileWrite) *write = nullptr; + decltype(cuFileHandleRegister)* handle_register = nullptr; + decltype(cuFileHandleDeregister)* handle_deregister = nullptr; + decltype(cuFileRead)* read = nullptr; + decltype(cuFileWrite)* write = nullptr; }; cufile_shim::cufile_shim() @@ -140,12 +140,12 @@ cufile_shim::cufile_shim() CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); - } catch (cudf::logic_error const &err) { + } catch (cudf::logic_error const& err) { init_error = std::make_unique(err); } } -cufile_shim const *cufile_shim::instance() +cufile_shim const* cufile_shim::instance() { static cufile_shim _instance; // Defer throwing to avoid repeated 
attempts to load the library @@ -165,7 +165,7 @@ void cufile_registered_file::register_handle() cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } -cufile_input_impl::cufile_input_impl(std::string const &filepath) +cufile_input_impl::cufile_input_impl(std::string const& filepath) : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) { } @@ -183,7 +183,7 @@ std::unique_ptr cufile_input_impl::read(size_t offset, size_t cufile_input_impl::read(size_t offset, size_t size, - uint8_t *dst, + uint8_t* dst, rmm::cuda_stream_view stream) { CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, @@ -192,19 +192,19 @@ size_t cufile_input_impl::read(size_t offset, return size; } -cufile_output_impl::cufile_output_impl(std::string const &filepath) +cufile_output_impl::cufile_output_impl(std::string const& filepath) : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { } -void cufile_output_impl::write(void const *data, size_t offset, size_t size) +void cufile_output_impl::write(void const* data, size_t offset, size_t size) { CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1, "cuFile error writing to a file"); } #endif -std::unique_ptr make_cufile_input(std::string const &filepath) +std::unique_ptr make_cufile_input(std::string const& filepath) { #ifdef CUFILE_FOUND if (cufile_config::instance()->is_enabled()) { @@ -218,7 +218,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath return nullptr; } -std::unique_ptr make_cufile_output(std::string const &filepath) +std::unique_ptr make_cufile_output(std::string const& filepath) { #ifdef CUFILE_FOUND if (cufile_config::instance()->is_enabled()) { diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 8a742076338..e92191095e3 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -40,8 +40,8 @@ class file_wrapper { size_t _size; public: - explicit file_wrapper(std::string const &filepath, int flags); - explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); + explicit file_wrapper(std::string const& filepath, int flags); + explicit file_wrapper(std::string const& filepath, int flags, mode_t mode); ~file_wrapper(); auto size() const { return _size; } auto desc() const { return fd; } @@ -105,7 +105,7 @@ class cufile_input : public cufile_io_base { * * @return The number of bytes read */ - virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; + virtual size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) = 0; }; /** @@ -122,7 +122,7 @@ class cufile_output : public cufile_io_base { * @param offset Number of bytes from the start * @param size Number of bytes to write */ - virtual void write(void const *data, size_t offset, size_t size) = 0; + virtual void write(void const* data, size_t offset, size_t size) = 0; }; #ifdef CUFILE_FOUND @@ -152,7 +152,7 @@ class cufile_config { */ bool is_required() const { return policy == "ALWAYS"; } - static cufile_config const *instance(); + static cufile_config const* instance(); }; /** @@ -162,14 +162,14 @@ struct cufile_registered_file { void register_handle(); public: - cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags) + cufile_registered_file(cufile_shim const* shim, std::string const& filepath, int flags) : _file(filepath, flags), 
shim{shim} { register_handle(); } - cufile_registered_file(cufile_shim const *shim, - std::string const &filepath, + cufile_registered_file(cufile_shim const* shim, + std::string const& filepath, int flags, mode_t mode) : _file(filepath, flags, mode), shim{shim} @@ -177,14 +177,14 @@ struct cufile_registered_file { register_handle(); } - auto const &handle() const noexcept { return cf_handle; } + auto const& handle() const noexcept { return cf_handle; } ~cufile_registered_file(); private: file_wrapper const _file; CUfileHandle_t cf_handle = nullptr; - cufile_shim const *shim = nullptr; + cufile_shim const* shim = nullptr; }; /** @@ -194,16 +194,16 @@ struct cufile_registered_file { */ class cufile_input_impl final : public cufile_input { public: - cufile_input_impl(std::string const &filepath); + cufile_input_impl(std::string const& filepath); std::unique_ptr read(size_t offset, size_t size, rmm::cuda_stream_view stream) override; - size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; + size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override; private: - cufile_shim const *shim = nullptr; + cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; }; @@ -214,12 +214,12 @@ class cufile_input_impl final : public cufile_input { */ class cufile_output_impl final : public cufile_output { public: - cufile_output_impl(std::string const &filepath); + cufile_output_impl(std::string const& filepath); - void write(void const *data, size_t offset, size_t size) override; + void write(void const* data, size_t offset, size_t size) override; private: - cufile_shim const *shim = nullptr; + cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; }; #else @@ -233,7 +233,7 @@ class cufile_input_impl final : public cufile_input { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } - size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override + size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } @@ -241,7 +241,7 @@ class cufile_input_impl final : public cufile_input { class cufile_output_impl final : public cufile_output { public: - void write(void const *data, size_t offset, size_t size) override + void write(void const* data, size_t offset, size_t size) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } @@ -254,7 +254,7 @@ class cufile_output_impl final : public cufile_output { * Returns a null pointer if an exception occurs in the `cufile_input_impl` constructor, or if the * cuFile library is not installed. */ -std::unique_ptr make_cufile_input(std::string const &filepath); +std::unique_ptr make_cufile_input(std::string const& filepath); /** * @brief Creates a `cufile_output_impl` object @@ -262,7 +262,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath * Returns a null pointer if an exception occurs in the `cufile_output_impl` constructor, or if the * cuFile library is not installed. 
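The `cufile_shim` a few hunks above resolves the cuFile entry points at runtime with `dlopen`/`dlsym`, so cudf links and runs even when libcufile is absent. A small sketch of that technique using libm's `cos` purely as a stand-in symbol (compile with `-ldl` on older glibc):

```cpp
#include <dlfcn.h>
#include <cstdio>

int main()
{
  // Open a shared library at runtime instead of linking against it.
  void* lib = dlopen("libm.so.6", RTLD_LAZY | RTLD_LOCAL);
  if (lib == nullptr) { std::fprintf(stderr, "%s\n", dlerror()); return 1; }

  // Look up a symbol by name and cast it to the right function type; the
  // shim above uses decltype(cuFileDriverOpen)* members for the same effect.
  auto cos_fn = reinterpret_cast<double (*)(double)>(dlsym(lib, "cos"));
  if (cos_fn == nullptr) { dlclose(lib); return 1; }

  std::printf("cos(0) = %f\n", cos_fn(0.0));  // 1.000000
  dlclose(lib);
  return 0;
}
```

Deferring the error (as `cufile_shim` does with `init_error`) rather than throwing from the constructor keeps a missing optional dependency from aborting unrelated code paths.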
*/ -std::unique_ptr make_cufile_output(std::string const &filepath); +std::unique_ptr make_cufile_output(std::string const& filepath); } // namespace detail } // namespace io diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index ee4b23bf831..147e53ba32b 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -38,8 +38,8 @@ class hostdevice_vector { hostdevice_vector() {} - hostdevice_vector(hostdevice_vector &&v) { move(std::move(v)); } - hostdevice_vector &operator=(hostdevice_vector &&v) + hostdevice_vector(hostdevice_vector&& v) { move(std::move(v)); } + hostdevice_vector& operator=(hostdevice_vector&& v) { move(std::move(v)); return *this; @@ -70,7 +70,7 @@ class hostdevice_vector { } } - bool insert(const T &data) + bool insert(const T& data) { if (num_elements < max_elements) { h_data[num_elements] = data; @@ -84,12 +84,12 @@ class hostdevice_vector { size_t size() const noexcept { return num_elements; } size_t memory_size() const noexcept { return sizeof(T) * num_elements; } - T &operator[](size_t i) const { return h_data[i]; } - T *host_ptr(size_t offset = 0) const { return h_data + offset; } - T *device_ptr(size_t offset = 0) { return reinterpret_cast(d_data.data()) + offset; } - T const *device_ptr(size_t offset = 0) const + T& operator[](size_t i) const { return h_data[i]; } + T* host_ptr(size_t offset = 0) const { return h_data + offset; } + T* device_ptr(size_t offset = 0) { return reinterpret_cast(d_data.data()) + offset; } + T const* device_ptr(size_t offset = 0) const { - return reinterpret_cast(d_data.data()) + offset; + return reinterpret_cast(d_data.data()) + offset; } operator cudf::device_span() { return {device_ptr(), max_elements}; } @@ -113,7 +113,7 @@ class hostdevice_vector { } private: - void move(hostdevice_vector &&v) + void move(hostdevice_vector&& v) { stream = v.stream; max_elements = v.max_elements; @@ -129,7 +129,7 @@ class hostdevice_vector { rmm::cuda_stream_view stream{}; size_t max_elements{}; size_t num_elements{}; - T *h_data{}; + T* h_data{}; rmm::device_buffer d_data{}; }; @@ -175,6 +175,15 @@ class hostdevice_2dvector { auto size() const noexcept { return _size; } + T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); } + T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); } + + T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } + + T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); } + + size_t memory_size() const noexcept { return _data.memory_size(); } + void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false) { _data.host_to_device(stream, synchronize); diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index c7eae48cbbc..a6b4978aeab 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -335,7 +335,9 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const* // Remove preceding zeros if (digit_count >= (sizeof(int64_max_abs) - 1)) { // Trim zeros at the beginning of raw_data - while (*data_begin == '0' && (data_begin < data_end)) { data_begin++; } + while (*data_begin == '0' && (data_begin < data_end)) { + data_begin++; + } } digit_count = data_end - data_begin; diff --git a/cpp/src/io/utilities/trie.cu b/cpp/src/io/utilities/trie.cu index 82d8f5e8336..bf03d6a6a89 100644 --- a/cpp/src/io/utilities/trie.cu +++ 
b/cpp/src/io/utilities/trie.cu @@ -33,7 +33,7 @@ namespace cudf { namespace detail { -rmm::device_uvector create_serialized_trie(const std::vector &keys, +rmm::device_uvector create_serialized_trie(const std::vector& keys, rmm::cuda_stream_view stream) { static constexpr int alphabet_size = std::numeric_limits::max() + 1; @@ -47,8 +47,8 @@ rmm::device_uvector create_serialized_trie(const std::vectorchildren[character] == nullptr) @@ -61,9 +61,9 @@ rmm::device_uvector create_serialized_trie(const std::vector 0 && is_white(no_comments[stop])) { stop--; } + while (stop > 0 && is_white(no_comments[stop])) { + stop--; + } CUDF_EXPECTS(stop != 0 || !is_white(no_comments[0]), "No CUDA device function name found in the input CUDA code.\n"); start = stop; - while (start > 0 && !is_white(no_comments[start])) { start--; } + while (start > 0 && !is_white(no_comments[start])) { + start--; + } start++; stop++; CUDF_EXPECTS(start < stop, "No CUDA device function name found in the input CUDA code.\n"); diff --git a/cpp/src/jit/parser.hpp b/cpp/src/jit/parser.hpp index 61228d7ffce..0b752d77d1f 100644 --- a/cpp/src/jit/parser.hpp +++ b/cpp/src/jit/parser.hpp @@ -106,7 +106,7 @@ class ptx_parser { std::vector parse_function_body(const std::string& src); /** - * @brief Remove leading white chractors and call `parse_instruction`. + * @brief Remove leading white characters and call `parse_instruction`. * * @param src The statement to be parsed. * @return The resulting CUDA statement. @@ -124,8 +124,8 @@ class ptx_parser { * * ---> asm volatile (" fma.rn.f32 _f4, _f3, _f1, _f2;"); * - * If a regiter from the input parameters list is used in an instruction - * its type is inferred from the intruction and saved in the `input_arg_list` + * If a register from the input parameters list is used in an instruction + * its type is inferred from the instruction and saved in the `input_arg_list` * to be used in when parsing the function header. * * See the document at https://github.com/hummingtree/cudf/wiki/PTX-parser diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index dfe3231e897..e6110edfaa8 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -33,14 +33,14 @@ namespace cudf { namespace detail { std::pair, std::unique_ptr
<table>> get_empty_joined_table( - table_view const &probe, table_view const &build) + table_view const& probe, table_view const& build) { std::unique_ptr<table> empty_probe = empty_like(probe); std::unique_ptr<table>
empty_build = empty_like(build); return std::make_pair(std::move(empty_probe), std::move(empty_build)); } -VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b, rmm::cuda_stream_view stream) +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) { CUDF_EXPECTS((a.first->size() == a.second->size()), "Mismatch between sizes of vectors in vector pair"); @@ -90,12 +90,11 @@ struct valid_range { */ std::pair>, std::unique_ptr>> -get_left_join_indices_complement( - std::unique_ptr> &right_indices, - size_type left_table_row_count, - size_type right_table_row_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) +get_left_join_indices_complement(std::unique_ptr>& right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Get array of indices that do not appear in right_indices @@ -169,8 +168,8 @@ get_left_join_indices_complement( * * @return Built hash table. */ -std::unique_ptr> build_join_hash_table( - cudf::table_view const &build, null_equality compare_nulls, rmm::cuda_stream_view stream) +std::unique_ptr> build_join_hash_table( + cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) { auto build_device_table = cudf::table_device_view::create(build, stream); @@ -198,7 +197,7 @@ std::unique_ptr> build_join_ *hash_table, hash_build, build_table_num_rows, - static_cast(row_bitmask.data()), + static_cast(row_bitmask.data()), failure.data()); // Check error code from the kernel if (failure.value(stream) == 1) { CUDF_FAIL("Hash Table insert failure."); } @@ -228,11 +227,11 @@ std::pair>, std::unique_ptr>> probe_join_hash_table(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const &hash_table, + multimap_type const& hash_table, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { // Use the output size directly if provided. 
Otherwise, compute the exact output size constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) @@ -308,10 +307,10 @@ probe_join_hash_table(cudf::table_device_view build_table, */ std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const &hash_table, + multimap_type const& hash_table, null_equality compare_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { std::size_t join_size = compute_join_output_size( build_table, probe_table, hash_table, compare_nulls, stream); @@ -342,7 +341,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, right_indices->data(), write_index.data(), join_size); - // Rlease intermediate memory alloation + // Release intermediate memory allocation left_indices->resize(0, stream); auto const left_table_row_count = probe_table.num_rows(); @@ -383,8 +382,8 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, return join_size + left_join_complement_size; } -std::unique_ptr combine_table_pair(std::unique_ptr &&left, - std::unique_ptr &&right) +std::unique_ptr combine_table_pair(std::unique_ptr&& left, + std::unique_ptr&& right) { auto joined_cols = left->release(); auto right_cols = right->release(); @@ -398,7 +397,7 @@ std::unique_ptr combine_table_pair(std::unique_ptr &&l hash_join::hash_join_impl::~hash_join_impl() = default; -hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, +hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) : _hash_table(nullptr) @@ -421,11 +420,11 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, +hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( @@ -434,11 +433,11 @@ hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::left_join(cudf::table_view const &probe, +hash_join::hash_join_impl::left_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( @@ -447,18 +446,18 @@ hash_join::hash_join_impl::left_join(cudf::table_view const &probe, std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::full_join(cudf::table_view const &probe, +hash_join::hash_join_impl::full_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( probe, compare_nulls, output_size, stream, mr); } -std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const &probe, +std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const { @@ -472,7 +471,7 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const 
&p *build_table, *probe_table, *_hash_table, compare_nulls, stream); } -std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const &probe, +std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const { @@ -488,10 +487,10 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const &pr *build_table, *probe_table, *_hash_table, compare_nulls, stream); } -std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const &probe, +std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); @@ -507,11 +506,11 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const &pr template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, +hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -533,7 +532,7 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, std::cend(_build), std::cbegin(flattened_probe_table), std::cend(flattened_probe_table), - [](const auto &b, const auto &p) { return b.type() == p.type(); }), + [](const auto& b, const auto& p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); return probe_join_indices( @@ -543,11 +542,11 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, +hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) { diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index f9ccbd68c74..1b4cbf4ba1d 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -153,6 +153,17 @@ std::pair, std::unique_ptr
> get_empty_joined_table std::unique_ptr combine_table_pair(std::unique_ptr&& left, std::unique_ptr&& right); +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream); + +std::pair>, + std::unique_ptr>> +get_left_join_indices_complement( + std::unique_ptr>& right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail struct hash_join::hash_join_impl { diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 6cb04cadcac..cf711524f0b 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -15,6 +15,7 @@ */ #include #include +#include #include #include @@ -219,6 +220,21 @@ std::unique_ptr
full_join(table_view const& left_input, return combine_table_pair(std::move(left_result), std::move(right_result)); } +std::pair>, + std::unique_ptr>> +conditional_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + join_kind JoinKind, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return get_conditional_join_indices( + left, right, JoinKind, binary_predicate, compare_nulls, stream, mr); +} + } // namespace detail hash_join::~hash_join() = default; @@ -356,4 +372,88 @@ std::unique_ptr
full_join(table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } +std::pair>, + std::unique_ptr>> +conditional_inner_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::INNER_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::pair>, + std::unique_ptr>> +conditional_left_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::pair>, + std::unique_ptr>> +conditional_full_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::FULL_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr> conditional_left_semi_join( + table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return std::move(detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_SEMI_JOIN, + rmm::cuda_stream_default, + mr) + .first); +} + +std::unique_ptr> conditional_left_anti_join( + table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return std::move(detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_ANTI_JOIN, + rmm::cuda_stream_default, + mr) + .first); +} } // namespace cudf diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh index 4298706987c..6d0810ea800 100644 --- a/cpp/src/join/join_kernels.cuh +++ b/cpp/src/join/join_kernels.cuh @@ -18,12 +18,18 @@ #include #include +#include +#include +#include #include #include #include +#include #include "join_common_utils.hpp" +#include + namespace cudf { namespace detail { /** @@ -203,39 +209,63 @@ __global__ void compute_join_output_size(multimap_type multi_map, * @brief Computes the output size of joining the left table to the right table. * * This method uses a nested loop to iterate over the left and right tables and count the number of - * matches. + * matches according to a boolean expression. * * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. * * @param[in] left_table The left table * @param[in] right_table The right table * @param[in] JoinKind The type of join to be performed - * @param[in] check_row_equality The row equality comparator + * @param[in] compare_nulls Controls whether null join-key values should match or not. + * @param[in] plan Container of device data required to evaluate the desired expression. 
* @param[out] output_size The resulting output size */ -template -__global__ void compute_nested_loop_join_output_size(table_device_view left_table, +template +__global__ void compute_conditional_join_output_size(table_device_view left_table, table_device_view right_table, join_kind JoinKind, - row_equality check_row_equality, + null_equality compare_nulls, + ast::detail::device_ast_plan plan, cudf::size_type* output_size) { + // The (required) extern storage of the shared memory array leads to + // conflicting declarations between different templates. The easiest + // workaround is to declare an arbitrary (here char) array type then cast it + // after the fact to the appropriate type. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + cudf::size_type thread_counter(0); const cudf::size_type left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; const cudf::size_type left_stride = blockDim.x * gridDim.x; const cudf::size_type left_num_rows = left_table.num_rows(); const cudf::size_type right_num_rows = right_table.num_rows(); + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows; left_row_index += left_stride) { bool found_match = false; for (cudf::size_type right_row_index = 0; right_row_index < right_num_rows; right_row_index++) { - if (check_row_equality(left_row_index, right_row_index)) { - ++thread_counter; + auto output_dest = cudf::ast::detail::value_expression_result(); + evaluator.evaluate(output_dest, left_row_index, right_row_index, 0); + if (output_dest.is_valid() && output_dest.value()) { + if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && + !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + ++thread_counter; + } found_match = true; } } - if ((JoinKind == join_kind::LEFT_JOIN) && (!found_match)) { ++thread_counter; } + if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || + JoinKind == join_kind::FULL_JOIN) && + (!found_match)) { + ++thread_counter; + } } using BlockReduce = cub::BlockReduce; @@ -428,32 +458,35 @@ __global__ void probe_hash_table(multimap_type multi_map, } /** - * @brief Performs a nested loop join to find all matching rows between the - * left and right tables and generate the output for the desired Join - * operation. + * @brief Performs a join conditioned on a predicate to find all matching rows + * between the left and right tables and generate the output for the desired + * Join operation. * * @tparam block_size The number of threads per block for this kernel * @tparam output_cache_size The side of the shared memory buffer to cache join * output results - + * @tparam has_nulls Whether or not the inputs may contain nulls. + * * @param[in] left_table The left table * @param[in] right_table The right table * @param[in] JoinKind The type of join to be performed - * @param[in] check_row_equality The row equality comparator + * @param compare_nulls Controls whether null join-key values should match or not. 
* @param[out] join_output_l The left result of the join operation * @param[out] join_output_r The right result of the join operation * @param[in,out] current_idx A global counter used by threads to coordinate * writes to the global output + * @param plan Container of device data required to evaluate the desired expression. * @param[in] max_size The maximum size of the output */ -template -__global__ void nested_loop_join(table_device_view left_table, +template +__global__ void conditional_join(table_device_view left_table, table_device_view right_table, join_kind JoinKind, - row_equality check_row_equality, + null_equality compare_nulls, cudf::size_type* join_output_l, cudf::size_type* join_output_r, cudf::size_type* current_idx, + cudf::ast::detail::device_ast_plan plan, const cudf::size_type max_size) { constexpr int num_warps = block_size / detail::warp_size; @@ -461,6 +494,15 @@ __global__ void nested_loop_join(table_device_view left_table, __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size]; __shared__ cudf::size_type join_shared_r[num_warps][output_cache_size]; + // Normally the casting of a shared memory array is used to create multiple + // arrays of different types from the shared memory buffer, but here it is + // used to circumvent conflicts between arrays of different types between + // different template instantiations due to the extern specifier. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + const int warp_id = threadIdx.x / detail::warp_size; const int lane_id = threadIdx.x % detail::warp_size; const cudf::size_type left_num_rows = left_table.num_rows(); @@ -473,18 +515,34 @@ __global__ void nested_loop_join(table_device_view left_table, cudf::size_type left_row_index = threadIdx.x + blockIdx.x * blockDim.x; const unsigned int activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); + + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + if (left_row_index < left_num_rows) { bool found_match = false; - for (size_type right_row_index(0); right_row_index < right_num_rows; right_row_index++) { - if (check_row_equality(left_row_index, right_row_index)) { + for (size_type right_row_index(0); right_row_index < right_num_rows; ++right_row_index) { + auto output_dest = cudf::ast::detail::value_expression_result(); + evaluator.evaluate(output_dest, left_row_index, right_row_index, 0); + + if (output_dest.is_valid() && output_dest.value()) { // If the rows are equal, then we have found a true match + // In the case of left anti joins we only add indices from left after + // the loop if we have found _no_ matches from the right. + // In the case of left semi joins we only add the first match (note + // that the current logic relies on the fact that we process all right + // table rows for a single left table row on a single thread so that no + // synchronization of found_match is required). 
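The `raw_intermediate_storage` declarations in these kernels work around a genuine CUDA restriction: every declaration of a given `extern __shared__` array must have the same type, which conflicting template instantiations would violate, so the buffer is declared once as `char` and reinterpreted per instantiation. A self-contained sketch of the same trick (hypothetical kernel, not part of this patch):

```cpp
#include <cuda_runtime.h>

// The extern __shared__ array is type-erased to char so that every template
// instantiation of the kernel declares it with one consistent type.
template <typename T>
__global__ void scale_with_scratch(T const* in, T* out, int n, T factor)
{
  extern __shared__ char raw_scratch[];            // single type-erased declaration
  T* scratch = reinterpret_cast<T*>(raw_scratch);  // per-instantiation view

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {
    scratch[threadIdx.x] = in[i] * factor;  // stage the value through shared memory
    out[i] = scratch[threadIdx.x];
  }
}

int main()
{
  float *d_in, *d_out;
  cudaMalloc(&d_in, 256 * sizeof(float));
  cudaMalloc(&d_out, 256 * sizeof(float));
  // Dynamic shared memory is sized at launch, much like shmem_size_per_block
  // is computed from shmem_per_thread in the host code below.
  scale_with_scratch<float><<<1, 256, 256 * sizeof(float)>>>(d_in, d_out, 256, 2.0f);
  cudaDeviceSynchronize();
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```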
+ if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && + !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + add_pair_to_cache(left_row_index, + right_row_index, + current_idx_shared, + warp_id, + join_shared_l[warp_id], + join_shared_r[warp_id]); + } found_match = true; - add_pair_to_cache(left_row_index, - right_row_index, - current_idx_shared, - warp_id, - join_shared_l[warp_id], - join_shared_r[warp_id]); } __syncwarp(activemask); @@ -506,8 +564,11 @@ __global__ void nested_loop_join(table_device_view left_table, } } - // If performing a LEFT join and no match was found, insert a Null into the output - if ((JoinKind == join_kind::LEFT_JOIN) && (!found_match)) { + // Left, left anti, and full joins all require saving left columns that + // aren't present in the right. + if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || + JoinKind == join_kind::FULL_JOIN) && + (!found_match)) { add_pair_to_cache(left_row_index, static_cast(JoinNoneValue), current_idx_shared, diff --git a/cpp/src/join/nested_loop_join.cuh b/cpp/src/join/nested_loop_join.cuh index 5054305a41a..9848477a894 100644 --- a/cpp/src/join/nested_loop_join.cuh +++ b/cpp/src/join/nested_loop_join.cuh @@ -19,7 +19,8 @@ #include "join_common_utils.hpp" #include "join_kernels.cuh" -#include +#include +#include #include #include #include @@ -28,167 +29,153 @@ #include #include +#include -#include +#include + +#include namespace cudf { namespace detail { + /** - * @brief Gives an estimate of the size of the join output produced when - * joining two tables together. - * - * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN + * @brief Computes the join operation between two tables and returns the + * output indices of left and right table as a combined table * - * @param left The left hand table - * @param right The right hand table + * @param left Table of left columns to join + * @param right Table of right columns to join + * tables have been flipped, meaning the output indices should also be flipped * @param JoinKind The type of join to be performed * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * - * @return An estimate of the size of the output of the join operation + * @return Join output indices vector pair */ -size_type estimate_nested_loop_join_output_size(table_device_view left, - table_device_view right, - join_kind JoinKind, - null_equality compare_nulls, - rmm::cuda_stream_view stream) +std::pair>, + std::unique_ptr>> +get_conditional_join_indices(table_view const& left, + table_view const& right, + join_kind JoinKind, + ast::expression binary_pred, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - const size_type left_num_rows{left.num_rows()}; - const size_type right_num_rows{right.num_rows()}; - - if (right_num_rows == 0) { - // If the right table is empty, we know exactly how large the output - // will be for the different types of joins and can return immediately + // We can immediately filter out cases where the right table is empty. In + // some cases, we return all the rows of the left table with a corresponding + // null index for the right table; in others, we return an empty output. 
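Because the match-gating conditions above are easy to misread, here is a host-side reference rendering (hypothetical and simplified to `int` indices with a caller-supplied predicate) of the per-row rules the kernel applies: inner and left joins record every predicate match, a left semi join records only the first match per left row, a left anti join records a left row only when nothing matched, and full joins take the left-join path here and are completed later by a right-complement pass:

```cpp
#include <functional>
#include <utility>
#include <vector>

enum class join_kind { INNER_JOIN, LEFT_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN };

constexpr int JoinNoneValue = -1;  // sentinel for "no matching right row"

// Hypothetical mirror of the kernel's per-left-row loop. On the device each
// left row is owned by one thread, which is why found_match needs no
// synchronization there.
std::vector<std::pair<int, int>> conditional_join_reference(
  int left_rows, int right_rows, std::function<bool(int, int)> const& pred, join_kind kind)
{
  std::vector<std::pair<int, int>> out;
  for (int l = 0; l < left_rows; ++l) {
    bool found_match = false;
    for (int r = 0; r < right_rows; ++r) {
      if (pred(l, r)) {
        // Anti joins never emit matches; semi joins emit only the first one.
        if (kind != join_kind::LEFT_ANTI_JOIN &&
            !(kind == join_kind::LEFT_SEMI_JOIN && found_match)) {
          out.emplace_back(l, r);
        }
        found_match = true;
      }
    }
    // Left and anti joins keep left rows that matched nothing.
    if ((kind == join_kind::LEFT_JOIN || kind == join_kind::LEFT_ANTI_JOIN) && !found_match) {
      out.emplace_back(l, JoinNoneValue);
    }
  }
  return out;
}
```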
+ if (right.num_rows() == 0) { switch (JoinKind) { - // Inner join with an empty table will have no output - case join_kind::INNER_JOIN: return 0; - - // Left join with an empty table will have an output of NULL rows - // equal to the number of rows in the left table - case join_kind::LEFT_JOIN: return left_num_rows; - - default: CUDF_FAIL("Unsupported join type"); + // Left, left anti, and full (which are effectively left because we are + // guaranteed that left has more rows than right) all return all the + // row indices from left with a corresponding NULL from the right. + case join_kind::LEFT_JOIN: + case join_kind::LEFT_ANTI_JOIN: + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream); + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::INNER_JOIN: + case join_kind::LEFT_SEMI_JOIN: + return std::make_pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr), + std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr)); } } - // Allocate storage for the counter used to get the size of the join output - size_type h_size_estimate{0}; - rmm::device_scalar<size_type> size_estimate(0, stream); + // Prepare output column. Whether or not the output column is nullable is + // determined by whether any of the columns in the input table are nullable. + // If none of the input columns actually contain nulls, we can still use the + // non-nullable version of the expression evaluation code path for + // performance, so we capture that information as well. + auto const nullable = + std::any_of(left.begin(), left.end(), [](column_view c) { return c.nullable(); }) || + std::any_of(right.begin(), right.end(), [](column_view c) { return c.nullable(); }); + auto const has_nulls = + nullable && + (std::any_of( + left.begin(), left.end(), [](column_view c) { return c.nullable() && c.has_nulls(); }) || + std::any_of( + right.begin(), right.end(), [](column_view c) { return c.nullable() && c.has_nulls(); })); + + auto const plan = ast::detail::ast_plan{binary_pred, left, right, has_nulls, stream, mr}; + CUDF_EXPECTS(plan.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); - CHECK_CUDA(stream.value()); + auto left_table = table_device_view::create(left, stream); + auto right_table = table_device_view::create(right, stream); + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar<size_type> size(0, stream, mr); + CHECK_CUDA(stream.value()); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - int numBlocks{-1}; + detail::grid_1d config(left_table->num_rows(), block_size); + auto const shmem_size_per_block = plan.dev_plan.shmem_per_thread * config.num_threads_per_block; - CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, compute_nested_loop_join_output_size<block_size>, block_size, 0)); - - int dev_id{-1}; - CUDA_TRY(cudaGetDevice(&dev_id)); - - int num_sms{-1}; - CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - - size_estimate.set_value_zero(stream); - - row_equality equality{left, right, compare_nulls == null_equality::EQUAL}; // Determine number of output rows without actually building the output to simply // find what the size of the output will be. - compute_nested_loop_join_output_size - <<<numBlocks * num_sms, block_size, 0, stream.value()>>>( - left, right, JoinKind, equality, size_estimate.data()); + join_kind KernelJoinKind = JoinKind == join_kind::FULL_JOIN ?
join_kind::LEFT_JOIN : JoinKind; + if (has_nulls) { + compute_conditional_join_output_size + <<>>( + *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); + } else { + compute_conditional_join_output_size + <<>>( + *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); + } CHECK_CUDA(stream.value()); - h_size_estimate = size_estimate.value(stream); + size_type const join_size = size.value(stream); - return h_size_estimate; -} - -/** - * @brief Computes the join operation between two tables and returns the - * output indices of left and right table as a combined table - * - * @param left Table of left columns to join - * @param right Table of right columns to join - * @param flip_join_indices Flag that indicates whether the left and right - * tables have been flipped, meaning the output indices should also be flipped - * @param JoinKind The type of join to be performed - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Join output indices vector pair - */ -std::pair, rmm::device_uvector> -get_base_nested_loop_join_indices(table_view const& left, - table_view const& right, - bool flip_join_indices, - join_kind JoinKind, - null_equality compare_nulls, - rmm::cuda_stream_view stream) -{ - // The `right` table is always used for the inner loop. We want to use the smaller table - // for the inner loop. Thus, if `left` is smaller than `right`, swap `left/right`. - if ((JoinKind == join_kind::INNER_JOIN) && (right.num_rows() > left.num_rows())) { - return get_base_nested_loop_join_indices(right, left, true, JoinKind, compare_nulls, stream); + // If the output size will be zero, we can return immediately. + if (join_size == 0) { + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } - // Trivial left join case - exit early - if ((JoinKind == join_kind::LEFT_JOIN) && (right.num_rows() == 0)) { - return get_trivial_left_join_indices(left, stream); + + rmm::device_scalar write_index(0, stream); + + auto left_indices = std::make_unique>(join_size, stream, mr); + auto right_indices = std::make_unique>(join_size, stream, mr); + + const auto& join_output_l = left_indices->data(); + const auto& join_output_r = right_indices->data(); + if (has_nulls) { + conditional_join + <<>>( + *left_table, + *right_table, + KernelJoinKind, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + plan.dev_plan, + join_size); + } else { + conditional_join + <<>>( + *left_table, + *right_table, + KernelJoinKind, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + plan.dev_plan, + join_size); } - auto left_table = table_device_view::create(left, stream); - auto right_table = table_device_view::create(right, stream); + CHECK_CUDA(stream.value()); - size_type estimated_size = estimate_nested_loop_join_output_size( - *left_table, *right_table, JoinKind, compare_nulls, stream); + auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); - // If the estimated output size is zero, return immediately - if (estimated_size == 0) { - return std::make_pair(rmm::device_uvector{0, stream}, - rmm::device_uvector{0, stream}); + // For full joins, get the indices in the right table that were not joined to + // by any row in the left table. 
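The complement step described in this comment can be stated compactly on the host: every right row that never appears in the left join's right-index column gets one extra `(NULL, r)` pair. A hypothetical host-side sketch (cudf performs the equivalent on-device via `get_left_join_indices_complement` and `concatenate_vector_pairs`):

```cpp
#include <utility>
#include <vector>

// Assemble full-join indices from a completed left join: append one
// (JoinNoneValue, r) pair for each right row r that no left row joined to.
std::vector<std::pair<int, int>> full_join_from_left_join(
  std::vector<std::pair<int, int>> pairs, int right_rows)
{
  constexpr int JoinNoneValue = -1;
  std::vector<bool> right_seen(right_rows, false);
  for (auto const& p : pairs) {
    if (p.second != JoinNoneValue) { right_seen[p.second] = true; }
  }
  for (int r = 0; r < right_rows; ++r) {
    if (!right_seen[r]) { pairs.emplace_back(JoinNoneValue, r); }
  }
  return pairs;
}
```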
+ if (JoinKind == join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, left.num_rows(), right.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } - - // Because we are approximating the number of joined elements, our approximation - // might be incorrect and we might have underestimated the number of joined elements. - // As such we will need to de-allocate memory and re-allocate memory to ensure - // that the final output is correct. - rmm::device_scalar write_index(0, stream); - size_type join_size{0}; - - rmm::device_uvector left_indices{0, stream}; - rmm::device_uvector right_indices{0, stream}; - auto current_estimated_size = estimated_size; - do { - left_indices.resize(estimated_size, stream); - right_indices.resize(estimated_size, stream); - - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(left_table->num_rows(), block_size); - write_index.set_value_zero(stream); - - row_equality equality{*left_table, *right_table, compare_nulls == null_equality::EQUAL}; - const auto& join_output_l = flip_join_indices ? right_indices.data() : left_indices.data(); - const auto& join_output_r = flip_join_indices ? left_indices.data() : right_indices.data(); - nested_loop_join - <<>>(*left_table, - *right_table, - JoinKind, - equality, - join_output_l, - join_output_r, - write_index.data(), - estimated_size); - - CHECK_CUDA(stream.value()); - - join_size = write_index.value(stream); - current_estimated_size = estimated_size; - estimated_size *= 2; - } while ((current_estimated_size < join_size)); - - left_indices.resize(join_size, stream); - right_indices.resize(join_size, stream); - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return join_indices; } } // namespace detail diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index c57327569a4..d451540deb6 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -30,16 +30,6 @@ namespace cudf { namespace lists { namespace detail { -void assert_same_data_type(column_view const& lhs, column_view const& rhs) -{ - CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Mismatched Data types."); - // Empty string column has no children - CUDF_EXPECTS(lhs.type().id() == type_id::STRING or lhs.num_children() == rhs.num_children(), - "Mismatched number of child columns."); - - for (int i{0}; i < lhs.num_children(); ++i) { assert_same_data_type(lhs.child(i), rhs.child(i)); } -} - /** * @brief Constructs null mask for a scattered list's child column * diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 3ce0f91fd71..55a6523ebdd 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -251,9 +251,9 @@ std::unique_ptr
explode_outer(table_view const& input_table, } } if (null_or_empty[idx]) { - auto invalid_index = null_or_empty_offset_p[idx] == 0 - ? offsets[idx] - : offsets[idx] + null_or_empty_offset_p[idx] - 1; + auto invalid_index = null_or_empty_offset_p[idx] == 0 + ? offsets[idx] + : offsets[idx] + null_or_empty_offset_p[idx] - 1; gather_map_p[invalid_index] = idx; explode_col_gather_map_p[invalid_index] = InvalidIndex; diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index c99b366c2dd..5baef2c7639 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -103,7 +103,7 @@ rmm::device_scalar reduce_device(InputIterator d_in, template struct minmax_binary_op : public thrust::binary_function, minmax_pair, minmax_pair> { - __device__ minmax_pair operator()(minmax_pair const &lhs, minmax_pair const &rhs) const + __device__ minmax_pair operator()(minmax_pair const& lhs, minmax_pair const& rhs) const { return minmax_pair{thrust::min(lhs.min_val, rhs.min_val), thrust::max(lhs.max_val, rhs.max_val)}; @@ -148,7 +148,7 @@ struct minmax_functor { } template - auto reduce(column_view const &col, rmm::cuda_stream_view stream) + auto reduce(column_view const& col, rmm::cuda_stream_view stream) { auto device_col = column_device_view::create(col, stream); // compute minimum and maximum values @@ -174,16 +174,16 @@ struct minmax_functor { *max_data = result->max_val; } - ResultType *result; - T *min_data; - T *max_data; + ResultType* result; + T* min_data; + T* max_data; }; template () and !std::is_same::value and - !cudf::is_dictionary()> * = nullptr> + !cudf::is_dictionary()>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -200,9 +200,9 @@ struct minmax_functor { /** * @brief Specialization for strings column. */ - template ::value> * = nullptr> + template ::value>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -219,9 +219,9 @@ struct minmax_functor { /** * @brief Specialization for dictionary column. 
*/ - template ()> * = nullptr> + template ()>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -236,9 +236,9 @@ struct minmax_functor { get_element(keys, static_cast(host_result.max_val), stream, mr)}; } - template ()> * = nullptr> + template ()>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &, rmm::cuda_stream_view, rmm::mr::device_memory_resource *) + cudf::column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("type not supported for minmax() operation"); } @@ -247,7 +247,7 @@ struct minmax_functor { } // namespace std::pair, std::unique_ptr> minmax( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (col.null_count() == col.size()) { // this handles empty and all-null columns @@ -264,7 +264,7 @@ std::pair, std::unique_ptr> minmax( * @copydoc cudf::minmax */ std::pair, std::unique_ptr> minmax( - const column_view &col, rmm::mr::device_memory_resource *mr) + const column_view& col, rmm::mr::device_memory_resource* mr) { return detail::minmax(col, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 00539b6d7a5..a8117373ca4 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -32,19 +32,19 @@ namespace detail { struct reduce_dispatch_functor { column_view const col; data_type output_dtype; - rmm::mr::device_memory_resource *mr; + rmm::mr::device_memory_resource* mr; rmm::cuda_stream_view stream; - reduce_dispatch_functor(column_view const &col, + reduce_dispatch_functor(column_view const& col, data_type output_dtype, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : col(col), output_dtype(output_dtype), mr(mr), stream(stream) { } template - std::unique_ptr operator()(std::unique_ptr const &agg) + std::unique_ptr operator()(std::unique_ptr const& agg) { switch (k) { case aggregation::SUM: return reduction::sum(col, output_dtype, stream, mr); break; @@ -58,11 +58,11 @@ struct reduce_dispatch_functor { break; case aggregation::MEAN: return reduction::mean(col, output_dtype, stream, mr); break; case aggregation::VARIANCE: { - auto var_agg = dynamic_cast(agg.get()); + auto var_agg = dynamic_cast(agg.get()); return reduction::variance(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::STD: { - auto var_agg = dynamic_cast(agg.get()); + auto var_agg = dynamic_cast(agg.get()); return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::MEDIAN: { @@ -73,7 +73,7 @@ struct reduce_dispatch_functor { return get_element(*col_ptr, 0, stream, mr); } break; case aggregation::QUANTILE: { - auto quantile_agg = dynamic_cast(agg.get()); + auto quantile_agg = dynamic_cast(agg.get()); CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); @@ -89,7 +89,7 @@ struct reduce_dispatch_functor { return get_element(*col_ptr, 0, stream, mr); } break; 
case aggregation::NUNIQUE: { - auto nunique_agg = dynamic_cast(agg.get()); + auto nunique_agg = dynamic_cast(agg.get()); return make_fixed_width_scalar( detail::distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), @@ -97,7 +97,7 @@ struct reduce_dispatch_functor { mr); } break; case aggregation::NTH_ELEMENT: { - auto nth_agg = dynamic_cast(agg.get()); + auto nth_agg = dynamic_cast(agg.get()); return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, stream, mr); } break; default: CUDF_FAIL("Unsupported reduction operator"); @@ -106,11 +106,11 @@ struct reduce_dispatch_functor { }; std::unique_ptr reduce( - column_view const &col, - std::unique_ptr const &agg, + column_view const& col, + std::unique_ptr const& agg, data_type output_dtype, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::unique_ptr result = make_default_constructed_scalar(output_dtype, stream, mr); result->set_valid_async(false, stream); @@ -124,10 +124,10 @@ std::unique_ptr reduce( } } // namespace detail -std::unique_ptr reduce(column_view const &col, - std::unique_ptr const &agg, +std::unique_ptr reduce(column_view const& col, + std::unique_ptr const& agg, data_type output_dtype, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::reduce(col, agg, output_dtype, rmm::cuda_stream_default, mr); diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index f729f812b28..1beb9ecb282 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -67,7 +68,46 @@ rmm::device_buffer mask_scan(const column_view& input_view, namespace { /** - * @brief Dispatcher for running Scan operation on input column + * @brief Strings inclusive scan operator + * + * This was specifically created to workaround a thrust issue + * https://github.com/NVIDIA/thrust/issues/1479 + * where invalid values are passed to the operator. + * + * This operator will accept index values, check them and then + * run the `Op` operation on the individual string_view objects. + * The returned result is the appropriate index value. + */ +template +struct string_scan_operator { + column_device_view const col; ///< strings column device view + string_view const null_replacement{}; ///< value used when element is null + bool const has_nulls; ///< true if col has null elements + + string_scan_operator(column_device_view const& col, bool has_nulls = true) + : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} + { + CUDF_EXPECTS(type_id::STRING == col.type().id(), "the data type mismatch"); + // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash + if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); + } + + CUDA_DEVICE_CALLABLE + size_type operator()(size_type lhs, size_type rhs) const + { + // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves; + // in these cases the return value does not matter since the result is not used + if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0; + string_view d_lhs = + has_nulls && col.is_null_nocheck(lhs) ? 
null_replacement : col.element(lhs); + string_view d_rhs = + has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); + return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs; + } +}; + +/** + * @brief Dispatcher for running a Scan operation on an input column * * @tparam Op device binary operator */ @@ -117,22 +157,25 @@ struct scan_dispatcher { { auto d_input = column_device_view::create(input_view, stream); - rmm::device_uvector result(input_view.size(), stream); - auto begin = - make_null_replacement_iterator(*d_input, Op::template identity(), input_view.has_nulls()); - thrust::inclusive_scan( - rmm::exec_policy(stream), begin, begin + input_view.size(), result.data(), Op{}); - - CHECK_CUDA(stream.value()); - return cudf::make_strings_column(result, Op::template identity(), stream, mr); + // build indices of the scan operation results + rmm::device_uvector result(input_view.size(), stream); + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input_view.size()), + result.begin(), + string_scan_operator{*d_input, input_view.has_nulls()}); + + // call gather using the indices to build the output column + return cudf::strings::detail::gather( + strings_column_view(input_view), result.begin(), result.end(), false, stream, mr); } public: /** - * @brief creates new column from input column by applying scan operation + * @brief Creates a new column from the input column by applying the scan operation * - * @param input input column view - * @param inclusive inclusive or exclusive scan + * @param input Input column view + * @param null_handling How null row entries are to be processed * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 5bbdb5988e7..98156224cfe 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -108,7 +108,7 @@ std::unique_ptr byte_list_conversion::operator()( } // namespace /** - * @copydoc cudf::byte_cast(input_column,flip_endianess,rmm::mr::device_memory_resource) + * @copydoc cudf::byte_cast(input_column,flip_endianness,rmm::mr::device_memory_resource) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -124,7 +124,7 @@ std::unique_ptr byte_cast(column_view const& input_column, } // namespace detail /** - * @copydoc cudf::byte_cast(input_column,flip_endianess,rmm::mr::device_memory_resource) + * @copydoc cudf::byte_cast(input_column,flip_endianness,rmm::mr::device_memory_resource) */ std::unique_ptr byte_cast(column_view const& input_column, flip_endianness endian_configuration, diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 2f19c8158c5..fa12fabffdc 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -40,10 +40,10 @@ struct tile_functor { } // anonymous namespace namespace detail { -std::unique_ptr
<table> tile(const table_view &in, +std::unique_ptr<table> tile(const table_view& in, size_type count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(count >= 0, "Count cannot be negative"); @@ -59,9 +59,9 @@ std::unique_ptr<table> tile(const table_view &in, } } // namespace detail -std::unique_ptr<table> tile(const table_view &in, +std::unique_ptr<table>
tile(const table_view& in, size_type count, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::tile(in, count, rmm::cuda_stream_default, mr); diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index d7114608787..862e44a0d2b 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -339,8 +339,8 @@ std::unique_ptr empty_output_for_rolling_aggregation(column_view const& // TODO: // Ideally, for UDF aggregations, the returned column would match // the agg's return type. It currently returns empty_like(input), because: - // 1. This preserves prior behaviour for empty input columns. - // 2. There is insufficient information to construct nested return colums. + // 1. This preserves prior behavior for empty input columns. + // 2. There is insufficient information to construct nested return columns. // `cudf::make_udf_aggregation()` expresses the return type as a `data_type` // which cannot express recursively nested types (e.g. `STRUCT>`.) // 3. In any case, UDFs that return nested types are not currently supported. @@ -616,7 +616,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre return aggs; } - // COLLECT_LIST aggregations do not peform a rolling operation at all. They get processed + // COLLECT_LIST aggregations do not perform a rolling operation at all. They get processed // entirely in the finalize() step. std::vector> visit( data_type, cudf::detail::collect_list_aggregation const&) override @@ -624,7 +624,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre return {}; } - // COLLECT_SET aggregations do not peform a rolling operation at all. They get processed + // COLLECT_SET aggregations do not perform a rolling operation at all. They get processed // entirely in the finalize() step. 
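The `rolling_store_output_functor` hunk above is a small instance of compile-time dispatch on the aggregation: the primary template stores the windowed result unchanged, while the MEAN specialization finishes the aggregation by dividing the accumulated value by the window's observation count. A condensed host-only sketch of the pattern (hypothetical names, omitting the SFINAE overloads for bool and timestamp types):

```cpp
#include <cstdint>

template <typename T, bool is_mean>
struct store_output {
  // Primary template: the rolling kernel already produced the final value.
  void operator()(T& out, T& val, int64_t /*count*/) const { out = val; }
};

template <typename T>
struct store_output<T, true> {
  // MEAN: the kernel accumulates a sum, so divide by the number of valid
  // observations in the window to finish the mean.
  void operator()(T& out, T& val, int64_t count) const { out = val / count; }
};

int main()
{
  double out = 0.0;
  double sum = 10.0;
  store_output<double, true>{}(out, sum, 4);  // out == 2.5
  return static_cast<int>(out != 2.5);
}
```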
std::vector> visit( data_type, cudf::detail::collect_set_aggregation const&) override diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index bd64cc39f47..5fabcf5b14e 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -29,30 +29,30 @@ namespace detail { // store functor template struct rolling_store_output_functor { - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) { out = val; } + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = val; } }; // Specialization for MEAN template struct rolling_store_output_functor<_T, true> { // SFINAE for non-bool types - template () || cudf::is_timestamp())> * = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) + template () || cudf::is_timestamp())>* = nullptr> + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = val / count; } // SFINAE for bool type - template ()> * = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) + template ()>* = nullptr> + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = static_cast(val) / count; } // SFINAE for timestamp types - template ()> * = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) + template ()>* = nullptr> + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = static_cast(val.time_since_epoch() / count); } diff --git a/cpp/src/rolling/rolling_jit_detail.hpp b/cpp/src/rolling/rolling_jit_detail.hpp index bba82f4d669..7fe9b68103e 100644 --- a/cpp/src/rolling/rolling_jit_detail.hpp +++ b/cpp/src/rolling/rolling_jit_detail.hpp @@ -30,8 +30,8 @@ T minimum(T a, T b) } struct preceding_window_wrapper { - const cudf::size_type *d_group_offsets; - const cudf::size_type *d_group_labels; + const cudf::size_type* d_group_offsets; + const cudf::size_type* d_group_labels; cudf::size_type preceding_window; cudf::size_type operator[](cudf::size_type idx) @@ -43,8 +43,8 @@ struct preceding_window_wrapper { }; struct following_window_wrapper { - const cudf::size_type *d_group_offsets; - const cudf::size_type *d_group_labels; + const cudf::size_type* d_group_offsets; + const cudf::size_type* d_group_labels; cudf::size_type following_window; cudf::size_type operator[](cudf::size_type idx) diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 546eb050a60..045bfbe0327 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -181,6 +181,12 @@ T fixed_point_scalar::fixed_point_value(rmm::cuda_stream_view stream) const numeric::scaled_integer{_data.value(stream), numeric::scale_type{type().scale()}}}; } +template +fixed_point_scalar::operator value_type() const +{ + return this->fixed_point_value(rmm::cuda_stream_default); +} + template typename fixed_point_scalar::rep_type* fixed_point_scalar::data() { diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index 66548ac1e73..c8a908e44cd 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -101,7 +101,7 @@ template void tie_break_ranks_transform(cudf::device_span dense_rank_sorted, TieIterator tie_iter, - column_view const &sorted_order_view, + column_view const& sorted_order_view, outputIterator rank_iter, TieBreaker tie_breaker, Transformer transformer, @@ -227,18 +227,18 @@ void rank_average(cudf::device_span group_keys, } // anonymous namespace -std::unique_ptr rank(column_view const &input, 
+std::unique_ptr rank(column_view const& input, rank_method method, order column_order, null_policy null_handling, null_order null_precedence, bool percentage, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { - data_type const output_type = (percentage or method == rank_method::AVERAGE) - ? data_type(type_id::FLOAT64) - : data_type(type_to_id()); + data_type const output_type = (percentage or method == rank_method::AVERAGE) + ? data_type(type_id::FLOAT64) + : data_type(type_to_id()); std::unique_ptr rank_column = [&null_handling, &output_type, &input, &stream, &mr] { // na_option=keep assign NA to NA values if (null_handling == null_policy::EXCLUDE) @@ -329,13 +329,13 @@ std::unique_ptr rank(column_view const &input, } } // namespace detail -std::unique_ptr rank(column_view const &input, +std::unique_ptr rank(column_view const& input, rank_method method, order column_order, null_policy null_handling, null_order null_precedence, bool percentage, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::rank(input, method, diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index c1e341217ab..29ff7b242e6 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -38,12 +39,26 @@ namespace { * @brief Base class for capitalize and title functors. * * Utility functions here manage access to the character case and flags tables. + * Any derived class must supply a `capitalize_next` member function. + * + * @tparam Derived class uses the CRTP pattern to reuse code logic. */ +template struct base_fn { character_flags_table_type const* d_flags; character_cases_table_type const* d_case_table; + special_case_mapping const* d_special_case_mapping; + column_device_view const d_column; + offset_type* d_offsets{}; + char* d_chars{}; - base_fn() : d_flags(get_character_flags_table()), d_case_table(get_character_cases_table()) {} + base_fn(column_device_view const& d_column) + : d_flags(get_character_flags_table()), + d_case_table(get_character_cases_table()), + d_special_case_mapping(get_special_case_mapping_table()), + d_column(d_column) + { + } using char_info = thrust::pair; @@ -54,94 +69,113 @@ struct base_fn { return char_info{code_point, flag}; } - __device__ char_utf8 convert_char(char_info const& info) const + __device__ int32_t convert_char(char_info const& info, char* d_buffer) const { - return codepoint_to_utf8(d_case_table[info.first]); - } -}; + auto const code_point = info.first; + auto const flag = info.second; -/** - * @brief Capitalize functor. - * - * This capitalizes the first letter of the string. - * Also lower-case any characters after the first letter. - */ -struct capitalize_fn : base_fn { - column_device_view const d_column; - offset_type* d_offsets{}; - char* d_chars{}; + if (!IS_SPECIAL(flag)) { + auto const new_char = codepoint_to_utf8(d_case_table[code_point]); + return d_buffer ? detail::from_char_utf8(new_char, d_buffer) + : detail::bytes_in_char_utf8(new_char); + } - capitalize_fn(column_device_view const& d_column) : base_fn(), d_column(d_column) {} + special_case_mapping m = d_special_case_mapping[get_special_case_hash_index(code_point)]; + + auto const count = IS_LOWER(flag) ? m.num_upper_chars : m.num_lower_chars; + auto const* chars = IS_LOWER(flag) ? 
m.upper : m.lower; + size_type bytes = 0; + for (uint16_t idx = 0; idx < count; idx++) { + bytes += d_buffer + ? detail::from_char_utf8(detail::codepoint_to_utf8(chars[idx]), d_buffer + bytes) + : detail::bytes_in_char_utf8(detail::codepoint_to_utf8(chars[idx])); + } + return bytes; + } + /** + * @brief Operator called for each row in `d_column`. + * + * This logic is shared by capitalize() and title() functions. + * The derived class must supply a `capitalize_next` member function. + */ __device__ void operator()(size_type idx) { if (d_column.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; } + Derived& derived = static_cast(*this); auto const d_str = d_column.element(idx); offset_type bytes = 0; auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - auto const info = get_char_info(*itr); + bool capitalize = true; + for (auto const chr : d_str) { + auto const info = get_char_info(chr); auto const flag = info.second; - auto const change_case = (itr == d_str.begin()) ? IS_LOWER(flag) : IS_UPPER(flag); - auto const new_char = change_case ? convert_char(info) : *itr; - - if (d_buffer) - d_buffer += detail::from_char_utf8(new_char, d_buffer); - else - bytes += detail::bytes_in_char_utf8(new_char); + auto const change_case = capitalize ? IS_LOWER(flag) : IS_UPPER(flag); + + if (change_case) { + auto const char_bytes = convert_char(info, d_buffer); + bytes += char_bytes; + d_buffer += d_buffer ? char_bytes : 0; + } else { + if (d_buffer) { + d_buffer += detail::from_char_utf8(chr, d_buffer); + } else { + bytes += detail::bytes_in_char_utf8(chr); + } + } + + // capitalize the next char if this one is a delimiter + capitalize = derived.capitalize_next(chr, flag); } if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Capitalize functor. + * + * This capitalizes the first character of the string and lower-cases + * the remaining characters. + * If a delimiter is specified, capitalization continues within the string + * on the first eligible character after any delimiter. + */ +struct capitalize_fn : base_fn { + string_view const d_delimiters; + + capitalize_fn(column_device_view const& d_column, string_view const& d_delimiters) + : base_fn(d_column), d_delimiters(d_delimiters) + { + } + + __device__ bool capitalize_next(char_utf8 const chr, character_flags_table_type const) + { + return !d_delimiters.empty() && (d_delimiters.find(chr) >= 0); + } +}; + /** * @brief Title functor. * * This capitalizes the first letter of each word. - * The beginning of a word is identified as the first alphabetic - * character after a non-alphabetic character. - * Also, lower-case all other alpabetic characters. + * The beginning of a word is identified as the first sequence_type + * character after a non-sequence_type character. + * Also, lower-case all other alphabetic characters. 
*/ -struct title_fn : base_fn { - column_device_view const d_column; +struct title_fn : base_fn { string_character_types sequence_type; - offset_type* d_offsets{}; - char* d_chars{}; title_fn(column_device_view const& d_column, string_character_types sequence_type) - : base_fn(), d_column(d_column), sequence_type(sequence_type) + : base_fn(d_column), sequence_type(sequence_type) { } - __device__ void operator()(size_type idx) + __device__ bool capitalize_next(char_utf8 const, character_flags_table_type const flag) { - if (d_column.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - } - - auto const d_str = d_column.element(idx); - offset_type bytes = 0; - auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - bool capitalize = true; - for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - auto const info = get_char_info(*itr); - auto const flag = info.second; - auto const change_case = - (flag & sequence_type) && (capitalize ? IS_LOWER(flag) : IS_UPPER(flag)); - auto const new_char = change_case ? convert_char(info) : *itr; - // capitalize the next char if this one is not a sequence_type - capitalize = (flag & sequence_type) == 0; - - if (d_buffer) - d_buffer += detail::from_char_utf8(new_char, d_buffer); - else - bytes += detail::bytes_in_char_utf8(new_char); - } - if (!d_chars) d_offsets[idx] = bytes; - } + return (flag & sequence_type) == 0; + }; }; /** @@ -154,10 +188,10 @@ struct title_fn : base_fn { * @param mr Device memory resource used for allocating the new device_buffer */ template -std::unique_ptr capitalize_utility(CapitalFn cfn, - strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr capitalizer(CapitalFn cfn, + strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto children = cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); @@ -173,12 +207,15 @@ std::unique_ptr capitalize_utility(CapitalFn cfn, } // namespace std::unique_ptr capitalize(strings_column_view const& input, + string_scalar const& delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(delimiters.is_valid(stream), "Delimiter must be a valid string"); if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); - auto d_column = column_device_view::create(input.parent(), stream); - return capitalize_utility(capitalize_fn{*d_column}, input, stream, mr); + auto const d_column = column_device_view::create(input.parent(), stream); + auto const d_delimiters = delimiters.value(stream); + return capitalizer(capitalize_fn{*d_column, d_delimiters}, input, stream, mr); } std::unique_ptr title(strings_column_view const& input, @@ -188,16 +225,17 @@ std::unique_ptr title(strings_column_view const& input, { if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); auto d_column = column_device_view::create(input.parent(), stream); - return capitalize_utility(title_fn{*d_column, sequence_type}, input, stream, mr); + return capitalizer(title_fn{*d_column, sequence_type}, input, stream, mr); } } // namespace detail std::unique_ptr capitalize(strings_column_view const& input, + string_scalar const& delimiter, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::capitalize(input, rmm::cuda_stream_default, mr); + return detail::capitalize(input, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr title(strings_column_view const& input, diff --git 
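The capitalize/title rewrite above shares one `operator()` loop through a CRTP base class; each derived functor only supplies `capitalize_next` to say when the following character becomes eligible for upper-casing. A minimal host-side sketch of that pattern (hypothetical names, plain `std::string` standing in for device columns):

```cpp
#include <cctype>
#include <iostream>
#include <string>

// Minimal CRTP sketch (hypothetical names): the base class owns the
// per-character loop, while each derived class supplies capitalize_next()
// to decide when the next character should be upper-cased.
template <typename Derived>
struct base_capitalizer {
  std::string operator()(std::string const& input) const
  {
    auto const& derived = static_cast<Derived const&>(*this);
    std::string out;
    bool capitalize = true;  // the first character is always eligible
    for (char const chr : input) {
      auto const uc = static_cast<unsigned char>(chr);
      out += static_cast<char>(capitalize ? std::toupper(uc) : std::tolower(uc));
      capitalize = derived.capitalize_next(chr);
    }
    return out;
  }
};

// capitalize: only delimiter characters re-enable capitalization.
struct capitalizer : base_capitalizer<capitalizer> {
  std::string delimiters;
  explicit capitalizer(std::string d) : delimiters(std::move(d)) {}
  bool capitalize_next(char chr) const { return delimiters.find(chr) != std::string::npos; }
};

// title: any non-alphabetic character starts a new word.
struct titler : base_capitalizer<titler> {
  bool capitalize_next(char chr) const
  {
    return !std::isalpha(static_cast<unsigned char>(chr));
  }
};

int main()
{
  std::cout << capitalizer{" "}("hello WORLD") << "\n";  // Hello World
  std::cout << titler{}("one-two three") << "\n";        // One-Two Three
}
```

CRTP keeps the dispatch static rather than virtual, so the per-character hook can be inlined, which matters when the same loop runs as device code.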
a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 5a69ac7b3d5..ccbedf99bc2 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -94,9 +94,9 @@ std::unique_ptr join_strings(strings_column_view const& strings, // only one entry so it is either all valid or all null auto const null_count = static_cast(strings.null_count() == strings_count && !narep.is_valid()); - auto null_mask = null_count - ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) - : rmm::device_buffer{0, stream, mr}; + auto null_mask = null_count + ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) + : rmm::device_buffer{0, stream, mr}; auto chars_column = create_chars_child_column(bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index c012663794b..2ef27759124 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -135,8 +135,8 @@ struct compute_size_and_concatenate_fn { struct scalar_separator_fn { string_scalar_device_view const d_separator; - __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const - noexcept + __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx); } @@ -202,8 +202,8 @@ struct column_separators_fn { column_device_view const separators_dv; string_scalar_device_view const sep_narep_dv; - __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const - noexcept + __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx) || (separators_dv.is_null(idx) && !sep_narep_dv.is_valid()); } diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index e2188365785..628dbcb8755 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -56,8 +56,8 @@ struct contains_fn { if (d_strings.is_null(idx)) return 0; string_view d_str = d_strings.element(idx); int32_t begin = 0; - int32_t end = bmatch ? 1 // match only the beginning of the string; - : -1; // this handles empty strings too + int32_t end = bmatch ? 
1 // match only the beginning of the string; + : -1; // this handles empty strings too return static_cast(prog.find(idx, d_str, begin, end)); } }; diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 0ec13b3648b..d804ac66961 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -796,7 +796,8 @@ struct datetime_formatter { val = val / 10; } ptr = tmpl + bytes - 1; - while (bytes-- > 0) *str++ = *ptr--; + while (bytes-- > 0) + *str++ = *ptr--; return str; } diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 7e6769a869b..aaee8c45169 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -267,7 +267,8 @@ struct duration_to_string_fn : public duration_to_string_size_fn { } digits_idx = std::max(digits_idx, min_digits); // digits are backwards, reverse the string into the output - while (digits_idx-- > 0) *str++ = digits[digits_idx]; + while (digits_idx-- > 0) + *str++ = digits[digits_idx]; return str; } diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 94c34f92c66..2f57b38249f 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -192,7 +192,7 @@ namespace { * @brief Calculate the size of the each string required for * converting each value in base-10 format. * - * ouput format is [-]integer.fraction + * output format is [-]integer.fraction */ template struct decimal_to_string_size_fn { diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index d4d6974cef5..b0910acb2a2 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -234,7 +234,7 @@ struct ftos_converter { static constexpr double upper_limit = 1000000000; // max is 1x10^9 static constexpr double lower_limit = 0.0001; // printf uses scientific notation below this // Tables for doing normalization: converting to exponent form - // IEEE double float has maximum exponent of 305 so these should cover everthing + // IEEE double float has maximum exponent of 305 so these should cover everything const double upper10[9] = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256}; const double lower10[9] = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256}; const double blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255}; @@ -252,7 +252,8 @@ struct ftos_converter { *ptr++ = (char)('0' + (value % 10)); value /= 10; } - while (ptr != buffer) *output++ = *--ptr; // 54321 -> 12345 + while (ptr != buffer) + *output++ = *--ptr; // 54321 -> 12345 return output; } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 7043174f5bf..c624819999f 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -154,7 +154,9 @@ struct integer_to_hex_fn { // compute the number of output bytes int bytes = sizeof(IntegerType); int byte_index = sizeof(IntegerType); - while ((--byte_index > 0) && (value_bytes[byte_index] & 0xFF) == 0) { --bytes; } + while ((--byte_index > 0) && (value_bytes[byte_index] & 0xFF) == 0) { + --bytes; + } // create output byte_index = bytes - 1; diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index d7b79547f29..4e323b98a2e 100644 --- 
a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -146,7 +146,8 @@ struct integers_to_ipv4_fn { else { char digits[3]; int num_digits = convert(value, digits); - while (num_digits-- > 0) *out_ptr++ = digits[num_digits]; + while (num_digits-- > 0) + *out_ptr++ = digits[num_digits]; } if ((n + 1) < 4) *out_ptr++ = '.'; shift_bits -= 8; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 33647c7b22f..abf2dc25097 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -335,9 +335,9 @@ std::unique_ptr url_decode( size_type chars_start = (strings.offset() == 0) ? 0 : cudf::detail::get_value( strings.offsets(), strings.offset(), stream); - size_type chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() - : cudf::detail::get_value( + size_type chars_end = (offset_count == strings.offsets().size()) + ? strings.chars_size() + : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); size_type chars_bytes = chars_end - chars_start; diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index 75ae7b3af6c..746923526a1 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -81,7 +81,8 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) char* ptr = d_buffer; if (is_negative) *ptr++ = '-'; // digits are backwards, reverse the string into the output - while (digits_idx-- > 0) *ptr++ = digits[digits_idx]; + while (digits_idx-- > 0) + *ptr++ = digits[digits_idx]; return bytes; } diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu new file mode 100644 index 00000000000..3545ec6d259 --- /dev/null +++ b/cpp/src/strings/copying/shift.cu @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::strings::detail { + +namespace { + +struct adjust_offsets_fn { + column_device_view const d_column; + string_view const d_filler; + size_type const offset; + + __device__ offset_type operator()(size_type idx) + { + if (offset < 0) { + auto const first = d_column.element(-offset); + auto const last_index = d_column.size() + offset; + if (idx < last_index) { + return d_column.element(idx - offset) - first; + } else { + auto const last = d_column.element(d_column.size() - 1); + return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes()); + } + } else { + if (idx < offset) { + return idx * d_filler.size_bytes(); + } else { + auto const total_filler = d_filler.size_bytes() * offset; + return total_filler + d_column.element(idx - offset); + } + } + } +}; + +struct shift_chars_fn { + column_device_view const d_column; + string_view const d_filler; + size_type const offset; + + __device__ char operator()(size_type idx) + { + if (offset < 0) { + auto const last_index = -offset; + if (idx < last_index) { + auto const first_index = d_column.size() + offset; + return d_column.element(idx + first_index); + } else { + auto const char_index = idx - last_index; + return d_filler.data()[char_index % d_filler.size_bytes()]; + } + } else { + if (idx < offset) { + return d_filler.data()[idx % d_filler.size_bytes()]; + } else { + return d_column.element(idx - offset); + } + } + } +}; + +} // namespace + +std::unique_ptr shift(strings_column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto d_fill_str = static_cast(fill_value).value(stream); + + // output offsets column is the same size as the input + auto const input_offsets = + cudf::slice(input.offsets(), {input.offset(), input.offset() + input.size() + 1}).front(); + auto const offsets_size = input_offsets.size(); + auto offsets_column = cudf::detail::allocate_like( + input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr); + + // run kernel to simultaneously shift and adjust the values in the output offsets column + auto d_offsets = mutable_column_device_view::create(offsets_column->mutable_view(), stream); + auto const d_input_offsets = column_device_view::create(input_offsets, stream); + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(offsets_size), + d_offsets->data(), + adjust_offsets_fn{*d_input_offsets, d_fill_str, offset}); + + // compute the shift-offset for the output characters child column + auto const shift_offset = [&] { + auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset; + return (offset < 0 ? 
-1 : 1) * + cudf::detail::get_value(offsets_column->view(), index, stream); + }(); + + // create output chars child column + auto const chars_size = + cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); + auto chars_column = create_chars_child_column(chars_size, stream, mr); + auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); + auto const d_input_chars = column_device_view::create(input.chars(), stream); + + // run kernel to shift the characters + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(chars_size), + d_chars->data(), + shift_chars_fn{*d_input_chars, d_fill_str, shift_offset}); + + // caller sets the null-mask + return make_strings_column(input.size(), + std::move(offsets_column), + std::move(chars_column), + 0, + rmm::device_buffer{}, + stream, + mr); +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index dfdd3226844..409e1892c91 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -99,6 +99,44 @@ class parser { return false; } + CUDA_HOST_DEVICE_CALLABLE bool is_hex_digit(char c) + { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + } + + CUDA_HOST_DEVICE_CALLABLE int64_t chars_left() { return input_len - ((pos - input) + 1); } + + /** + * @brief Parse an escape sequence. + * + * Must be a valid sequence as specified by the JSON format + * https://www.json.org/json-en.html + * + * @returns True on success or false on fail. + */ + CUDA_HOST_DEVICE_CALLABLE bool parse_escape_seq() + { + if (*pos != '\\') { return false; } + char c = *++pos; + + // simple case + if (c == '\"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || + c == 't') { + pos++; + return true; + } + + // hex digits: must be of the form uXXXX where each X is a valid hex digit + if (c == 'u' && chars_left() >= 4 && is_hex_digit(pos[1]) && is_hex_digit(pos[2]) && + is_hex_digit(pos[3]) && is_hex_digit(pos[4])) { + pos += 5; + return true; + } + + // an illegal escape sequence. + return false; + } + /** * @brief Parse a quote-enclosed JSON string. * @@ -123,12 +161,16 @@ class parser { const char* start = ++pos; while (!eof()) { - if (*pos == quote) { + // handle escaped characters + if (*pos == '\\') { + if (!parse_escape_seq()) { return parse_result::ERROR; } + } else if (*pos == quote) { str = string_view(start, pos - start); pos++; return parse_result::SUCCESS; + } else { + pos++; } - pos++; } } } @@ -230,15 +272,22 @@ class json_state : private parser { int arr_count = 0; while (!eof(end)) { - // could do some additional checks here. we know our current - // element type, so we could be more strict on what kinds of - // characters we expect to see. 
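`parse_escape_seq` above accepts the simple JSON escapes plus `\uXXXX` with exactly four hex digits, and rejects anything else. A standalone host-side sketch of that validation rule (hypothetical helper names):

```cpp
#include <iostream>

// Hex-digit check mirroring the is_hex_digit helper above.
bool is_hex_digit(char c)
{
  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

// Returns the length of a valid JSON escape sequence starting at s
// (including the backslash), or 0 if the sequence is illegal.
// chars_left counts the characters available starting at s.
int escape_seq_length(char const* s, int chars_left)
{
  if (chars_left < 2 || s[0] != '\\') { return 0; }
  char const c = s[1];

  // simple one-character escapes
  if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' ||
      c == 't') {
    return 2;
  }

  // \uXXXX: 'u' must be followed by exactly four hex digits
  if (c == 'u' && chars_left >= 6 && is_hex_digit(s[2]) && is_hex_digit(s[3]) &&
      is_hex_digit(s[4]) && is_hex_digit(s[5])) {
    return 6;
  }

  return 0;  // illegal escape sequence
}

int main()
{
  std::cout << escape_seq_length("\\n", 2) << "\n";      // 2
  std::cout << escape_seq_length("\\u00e9", 6) << "\n";  // 6
  std::cout << escape_seq_length("\\x41", 4) << "\n";    // 0 (illegal)
}
```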
- switch (*end++) { - case '{': obj_count++; break; - case '}': obj_count--; break; - case '[': arr_count++; break; - case ']': arr_count--; break; - default: break; + // parse strings explicitly so we handle all interesting corner cases (such as strings + // containing {, }, [ or ] + if (is_quote(*end)) { + string_view str; + pos = end; + if (parse_string(str, false, *end) == parse_result::ERROR) { return parse_result::ERROR; } + end = pos; + } else { + char const c = *end++; + switch (c) { + case '{': obj_count++; break; + case '}': obj_count--; break; + case '[': arr_count++; break; + case ']': arr_count--; break; + default: break; + } } if (obj_count == 0 && arr_count == 0) { break; } } @@ -620,7 +669,7 @@ std::pair>, int> build_comma if (op.type == path_operator_type::ROOT) { CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root"); } - // if we havent' gotten a root operator to start, and we're not empty, quietly push a + // if we have not gotten a root operator to start, and we're not empty, quietly push a // root operator now. if (h_operators.size() == 0 && op.type != path_operator_type::ROOT && op.type != path_operator_type::END) { diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 253bf846993..6fee47ea225 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -95,7 +95,8 @@ std::unique_ptr pad( string_view d_str = d_strings.element(idx); auto length = d_str.length(); char* ptr = d_chars + d_offsets[idx]; - while (length++ < width) ptr += from_char_utf8(d_fill_char, ptr); + while (length++ < width) + ptr += from_char_utf8(d_fill_char, ptr); copy_string(ptr, d_str); }); } else if (side == pad_side::RIGHT) { @@ -109,7 +110,8 @@ std::unique_ptr pad( auto length = d_str.length(); char* ptr = d_chars + d_offsets[idx]; ptr = copy_string(ptr, d_str); - while (length++ < width) ptr += from_char_utf8(d_fill_char, ptr); + while (length++ < width) + ptr += from_char_utf8(d_fill_char, ptr); }); } else if (side == pad_side::BOTH) { thrust::for_each_n( @@ -124,9 +126,11 @@ std::unique_ptr pad( auto right_pad = (width & 1) ? pad / 2 : (pad - pad / 2); // odd width = right-justify auto left_pad = pad - right_pad; // e.g. 
width=7 gives "++foxx+" while width=6 gives "+fox++" - while (left_pad-- > 0) ptr += from_char_utf8(d_fill_char, ptr); + while (left_pad-- > 0) + ptr += from_char_utf8(d_fill_char, ptr); ptr = copy_string(ptr, d_str); - while (right_pad-- > 0) ptr += from_char_utf8(d_fill_char, ptr); + while (right_pad-- > 0) + ptr += from_char_utf8(d_fill_char, ptr); }); } @@ -181,7 +185,8 @@ std::unique_ptr zfill( string_view d_str = d_strings.element(idx); auto length = d_str.length(); char* out_ptr = d_chars + d_offsets[idx]; - while (length++ < width) *out_ptr++ = '0'; // prepend zero char + while (length++ < width) + *out_ptr++ = '0'; // prepend zero char copy_string(out_ptr, d_str); }); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 6cac49d3c26..0e00221dabf 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -701,11 +701,13 @@ class regex_compiler { regex_parser::Item item = in[i]; if (item.d.yycount.n <= 0) { // need to erase - for (std::size_t j = 0; j < i - rep_start; j++) out.pop_back(); + for (std::size_t j = 0; j < i - rep_start; j++) + out.pop_back(); } else { // repeat for (int j = 1; j < item.d.yycount.n; j++) - for (std::size_t k = rep_start; k < i; k++) out.push_back(in[k]); + for (std::size_t k = rep_start; k < i; k++) + out.push_back(in[k]); } // optional repeats @@ -715,7 +717,8 @@ class regex_compiler { o_item.t = LBRA_NC; o_item.d.yy = 0; out.push_back(o_item); - for (std::size_t k = rep_start; k < i; k++) out.push_back(in[k]); + for (std::size_t k = rep_start; k < i; k++) + out.push_back(in[k]); } for (int j = item.d.yycount.n; j < item.d.yycount.m; j++) { regex_parser::Item o_item; @@ -746,7 +749,8 @@ class regex_compiler { } } else // copy it once then put '*' { - for (std::size_t k = rep_start; k < i; k++) out.push_back(in[k]); + for (std::size_t k = rep_start; k < i; k++) + out.push_back(in[k]); if (item.t == COUNTED) { o_item.t = STAR; @@ -841,12 +845,14 @@ void reprog::optimize1() if (_insts[i].type != NOP) { { int target_id = _insts[i].u2.next_id; - while (_insts[target_id].type == NOP) target_id = _insts[target_id].u2.next_id; + while (_insts[target_id].type == NOP) + target_id = _insts[target_id].u2.next_id; _insts[i].u2.next_id = target_id; } if (_insts[i].type == OR) { int target_id = _insts[i].u1.right_id; - while (_insts[target_id].type == NOP) target_id = _insts[target_id].u2.next_id; + while (_insts[target_id].type == NOP) + target_id = _insts[target_id].u2.next_id; _insts[i].u1.right_id = target_id; } } @@ -854,7 +860,8 @@ void reprog::optimize1() // skip NOPs from the beginning { int target_id = _startinst_id; - while (_insts[target_id].type == NOP) target_id = _insts[target_id].u2.next_id; + while (_insts[target_id].type == NOP) + target_id = _insts[target_id].u2.next_id; _startinst_id = target_id; } // actually remove the no-ops @@ -950,7 +957,8 @@ void reprog::print() printf("startinst_id=%d\n", _startinst_id); if (_startinst_ids.size() > 0) { printf("startinst_ids:"); - for (size_t i = 0; i < _startinst_ids.size(); i++) printf(" %d", _startinst_ids[i]); + for (size_t i = 0; i < _startinst_ids.size(); i++) + printf(" %d", _startinst_ids[i]); printf("\n"); } diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index eddda3fe0eb..854fce15fd4 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -231,7 +231,8 @@ __device__ inline int32_t reprog_device::regexec( if (((eos < 0) || (pos < eos)) && match == 0) { int32_t i = 0; auto 
ids = startinst_ids(); - while (ids[i] >= 0) jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); + while (ids[i] >= 0) + jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); } c = static_cast(pos >= txtlen ? 0 : *itr); diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 2d9d40e2d68..5b058d7b696 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -56,7 +56,7 @@ struct replace_multi_regex_fn { reprog_device* progs; // array of regex progs size_type number_of_patterns; found_range* d_found_ranges; // working array matched (begin,end) values - column_device_view const d_repls; // replacment strings + column_device_view const d_repls; // replacement strings int32_t* d_offsets{}; // these are null when char* d_chars{}; // only computing size @@ -105,8 +105,8 @@ struct replace_multi_regex_fn { size_type end = d_ranges[ptn_idx].second; string_view d_repl = d_repls.size() > 1 ? d_repls.element(ptn_idx) : d_repls.element(0); - auto spos = d_str.byte_offset(begin); - auto epos = d_str.byte_offset(end); + auto spos = d_str.byte_offset(begin); + auto epos = d_str.byte_offset(end); nbytes += d_repl.size_bytes() - (epos - spos); if (out_ptr) { // copy unmodified content plus new replacement string out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 4185e6db685..979974a2fdb 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -568,9 +568,9 @@ std::unique_ptr replace(strings_column_view con (strings.offset() == 0) ? 0 : cudf::detail::get_value(strings.offsets(), strings.offset(), stream); - size_type const chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() - : cudf::detail::get_value( + size_type const chars_end = (offset_count == strings.offsets().size()) + ? strings.chars_size() + : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); size_type const chars_bytes = chars_end - chars_start; @@ -604,11 +604,11 @@ std::unique_ptr replace( auto const offset_count = strings_count + 1; auto const d_offsets = strings.offsets().data() + strings.offset(); size_type chars_start = (strings.offset() == 0) ? 0 - : cudf::detail::get_value( + : cudf::detail::get_value( strings.offsets(), strings.offset(), stream); - size_type chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() - : cudf::detail::get_value( + size_type chars_end = (offset_count == strings.offsets().size()) + ? strings.chars_size() + : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); return replace_char_parallel( strings, chars_start, chars_end, d_target, d_repl, maxrepl, stream, mr); diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index ae0ea4b90e6..9c5be1c9ca3 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -154,10 +154,10 @@ struct split_tokenizer_fn : base_split_tokenizer { auto next_delim = ((idx + col) < positions_count) // boundary check for delims in last string ? (base_ptr + d_positions[idx + col]) // start of next delimiter : str_end_ptr; // or end of this string - auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string + auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string && (col + 1 < token_count) // and this is not the last token - ? 
next_delim - : str_end_ptr; + ? next_delim + : str_end_ptr; // store the token into the output vector d_tokens[col * d_strings.size()] = string_index_pair{str_ptr, static_cast(eptr - str_ptr)}; @@ -281,10 +281,10 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { auto prev_delim = (idx >= col) // boundary check for delims in first string ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter : str_begin_ptr; // or the start of this string - auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string + auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string && (col + 1 < token_count) // and this is not the last token - ? prev_delim - : str_begin_ptr; + ? prev_delim + : str_begin_ptr; // store the token into the output -- building the array backwards d_tokens[d_strings.size() * (token_count - 1 - col)] = string_index_pair{sptr, static_cast(str_ptr - sptr)}; diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index 4cd85fc5e7e..904ce5470ce 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -28,7 +28,9 @@ table::table(table const& other) : _num_rows{other.num_rows()} { CUDF_FUNC_RANGE(); _columns.reserve(other._columns.size()); - for (auto const& c : other._columns) { _columns.emplace_back(std::make_unique(*c)); } + for (auto const& c : other._columns) { + _columns.emplace_back(std::make_unique(*c)); + } } // Move the contents of a vector `unique_ptr` @@ -53,7 +55,9 @@ table::table(table_view view, rmm::cuda_stream_view stream, rmm::mr::device_memo { CUDF_FUNC_RANGE(); _columns.reserve(view.num_columns()); - for (auto const& c : view) { _columns.emplace_back(std::make_unique(c, stream, mr)); } + for (auto const& c : view) { + _columns.emplace_back(std::make_unique(c, stream, mr)); + } } // Create immutable view @@ -61,7 +65,9 @@ table_view table::view() const { std::vector views; views.reserve(_columns.size()); - for (auto const& c : _columns) { views.push_back(c->view()); } + for (auto const& c : _columns) { + views.push_back(c->view()); + } return table_view{views}; } @@ -70,7 +76,9 @@ mutable_table_view table::mutable_view() { std::vector views; views.reserve(_columns.size()); - for (auto const& c : _columns) { views.push_back(c->mutable_view()); } + for (auto const& c : _columns) { + views.push_back(c->mutable_view()); + } return mutable_table_view{views}; } diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 62daeed6d79..859a6be3bb0 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -55,7 +55,7 @@ template class table_device_view_base; template class table_device_view_base; namespace { -struct is_relationally_comparable_impl { +struct is_relationally_comparable_functor { template constexpr bool operator()() { @@ -74,7 +74,7 @@ bool is_relationally_comparable(TableView const& lhs, TableView const& rhs) // TODO: possible to implement without double type dispatcher. 
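The `table` copy constructor and view builders above repeat the same reserve-then-emplace loop over owned columns. A generic sketch of that deep-copy idiom (illustrative only, not the libcudf code):

```cpp
#include <iostream>
#include <memory>
#include <vector>

// Generic sketch of the deep-copy idiom used by the table copy constructor
// above: clone each element behind a unique_ptr into a new owning vector.
template <typename T>
std::vector<std::unique_ptr<T>> deep_copy(std::vector<std::unique_ptr<T>> const& src)
{
  std::vector<std::unique_ptr<T>> dst;
  dst.reserve(src.size());  // reserve first, as in table::table(table const&)
  for (auto const& p : src) {
    dst.emplace_back(std::make_unique<T>(*p));  // invokes T's copy constructor
  }
  return dst;
}

int main()
{
  std::vector<std::unique_ptr<int>> a;
  a.emplace_back(std::make_unique<int>(42));
  auto b = deep_copy(a);
  *b[0]  = 7;  // the copy is independent of the original
  std::cout << *a[0] << " " << *b[0] << "\n";  // prints: 42 7
}
```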
       return lhs.column(i).type() == rhs.column(i).type() and
              type_dispatcher(lhs.column(i).type(),
-                             is_relationally_comparable_impl{});
+                             is_relationally_comparable_functor{});
     });
 }
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index c64bf5b2823..abd909f8cfc 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -43,7 +43,9 @@ auto concatenate_column_views(std::vector const& views)
 {
   using ColumnView = typename ViewType::ColumnView;
   std::vector concat_cols;
-  for (auto& view : views) { concat_cols.insert(concat_cols.end(), view.begin(), view.end()); }
+  for (auto& view : views) {
+    concat_cols.insert(concat_cols.end(), view.begin(), view.end());
+  }
   return concat_cols;
 }
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index cab5a54a57d..f9b2355b2ff 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -41,7 +41,7 @@ namespace {
 /**
  * @brief Generate ngrams from strings column.
  *
- * Adjacent strings are concatented with the provided separator.
+ * Adjacent strings are concatenated with the provided separator.
  * The number of adjacent strings join depends on the specified ngrams value.
  * For example: for bigrams (ngrams=2), pairs of strings are concatenated.
  */
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index f99c831e745..e20c7120571 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -380,7 +380,7 @@ __device__ size_type row_size_functor::operator()(column_device_vie
 /**
  * @brief Kernel for computing per-row sizes in bits.
  *
- * @param cols An span of column_device_views represeting a column hierarcy
+ * @param cols A span of column_device_views representing a column hierarchy
  * @param info An span of column_info structs corresponding the elements in `cols`
  * @param output Output span of size (# rows) where per-row bit sizes are stored
  * @param max_branch_depth Maximum depth of the span stack needed per-thread
diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp
new file mode 100644
index 00000000000..d297148de45
--- /dev/null
+++ b/cpp/src/utilities/type_checks.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace cudf {
+namespace {
+
+struct columns_equal_fn {
+  template
+  bool operator()(column_view const&, column_view const&)
+  {
+    return true;
+  }
+};
+
+template <>
+bool columns_equal_fn::operator()(column_view const& lhs, column_view const& rhs)
+{
+  auto const kidx = dictionary_column_view::keys_column_index;
+  return lhs.num_children() > 0 and rhs.num_children() > 0
+           ?
lhs.child(kidx).type() == rhs.child(kidx).type() + : lhs.is_empty() and rhs.is_empty(); +} + +template <> +bool columns_equal_fn::operator()(column_view const& lhs, column_view const& rhs) +{ + auto const& ci = lists_column_view::child_column_index; + return column_types_equal(lhs.child(ci), rhs.child(ci)); +} + +template <> +bool columns_equal_fn::operator()(column_view const& lhs, column_view const& rhs) +{ + return lhs.num_children() == rhs.num_children() and + std::all_of(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lhs.num_children()), + [&](auto i) { return column_types_equal(lhs.child(i), rhs.child(i)); }); +} + +}; // namespace + +// Implementation note: avoid using double dispatch for this function +// as it increases code paths to NxN for N types. +bool column_types_equal(column_view const& lhs, column_view const& rhs) +{ + if (lhs.type() != rhs.type()) { return false; } + return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4360b418e95..ddb5d88f2d0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,11 +62,13 @@ ConfigureTest(GROUPBY_TEST groupby/count_tests.cpp groupby/groups_tests.cpp groupby/keys_tests.cpp + groupby/m2_tests.cpp groupby/min_tests.cpp groupby/max_scan_tests.cpp groupby/max_tests.cpp groupby/mean_tests.cpp groupby/median_tests.cpp + groupby/merge_m2_tests.cpp groupby/merge_lists_tests.cpp groupby/merge_sets_tests.cpp groupby/min_scan_tests.cpp @@ -86,6 +88,7 @@ ConfigureTest(GROUPBY_TEST # - join tests ------------------------------------------------------------------------------------ ConfigureTest(JOIN_TEST join/join_tests.cpp + join/conditional_join_tests.cu join/cross_join_tests.cpp join/semi_anti_join_tests.cpp) @@ -161,6 +164,8 @@ ConfigureTest(BINARY_TEST binaryop/binop-verify-input-test.cpp binaryop/binop-null-test.cpp binaryop/binop-integration-test.cpp + binaryop/binop-compiled-test.cpp + binaryop/binop-compiled-fixed_point-test.cpp binaryop/binop-generic-ptx-test.cpp ) @@ -219,6 +224,7 @@ ConfigureTest(COPYING_TEST copying/scatter_list_tests.cpp copying/scatter_list_scalar_tests.cpp copying/scatter_struct_tests.cpp + copying/scatter_struct_scalar_tests.cpp copying/segmented_gather_list_tests.cpp copying/shift_tests.cpp copying/slice_tests.cpp @@ -233,7 +239,8 @@ ConfigureTest(UTILITIES_TEST utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp - utilities_tests/default_stream_tests.cpp) + utilities_tests/default_stream_tests.cpp + utilities_tests/type_check_tests.cpp) ################################################################################################### # - span tests ------------------------------------------------------------------------------- diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 74937d4deea..48e19a2f587 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -31,10 +31,13 @@ #include #include -#include +#include +#include #include +#include #include +#include template using column_wrapper = cudf::test::fixed_width_column_wrapper; @@ -409,4 +412,46 @@ TEST_F(TransformTest, PyMod) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, BasicAdditionNulls) +{ + auto c_0 = column_wrapper{{3, 20, 1, 50}, {0, 0, 1, 1}}; + auto c_1 = column_wrapper{{10, 7, 20, 0}, {0, 1, 0, 1}}; + auto table 
= cudf::table_view{{c_0, c_1}};
+
+  auto col_ref_0 = cudf::ast::column_reference(0);
+  auto col_ref_1 = cudf::ast::column_reference(1);
+  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
+
+  auto expected = column_wrapper{{0, 0, 0, 50}, {0, 0, 0, 1}};
+  auto result = cudf::ast::compute_column(table, expression);
+
+  cudf::test::expect_columns_equal(expected, result->view(), true);
+}
+
+TEST_F(TransformTest, BasicAdditionLargeNulls)
+{
+  auto N = 2000;
+  auto a = thrust::make_counting_iterator(0);
+
+  auto validities = std::vector(N);
+  std::fill(validities.begin(), validities.begin() + N / 2, 0);
+  std::fill(validities.begin() + (N / 2), validities.end(), 1);
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::shuffle(validities.begin(), validities.end(), gen);
+
+  auto col = column_wrapper(a, a + N, validities.begin());
+  auto table = cudf::table_view{{col}};
+
+  auto col_ref = cudf::ast::column_reference(0);
+  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref);
+
+  auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
+  auto expected = column_wrapper(b, b + N, validities.begin());
+  auto result = cudf::ast::compute_column(table, expression);
+
+  cudf::test::expect_columns_equal(expected, result->view(), true);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/binaryop/assert-binops.h b/cpp/tests/binaryop/assert-binops.h
index 9e762a1c987..65859251e42 100644
--- a/cpp/tests/binaryop/assert-binops.h
+++ b/cpp/tests/binaryop/assert-binops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Copyright 2018-2019 BlazingDB, Inc.
  * Copyright 2018 Christian Noboa Mardini
  *
@@ -36,28 +36,21 @@ namespace binop {
 // result returned by the binop operation into string, which is then used for display purposes
 // when the values do not match.
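The `BasicAdditionNulls` test above encodes AND-semantics for validity: an output row is valid only when both operands are valid. A host-side model of those semantics (illustrative sketch, not the libcudf implementation):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Host-side model (not the libcudf implementation) of the null semantics the
// tests above expect: an output row is valid only when both input rows are
// valid, and null output rows carry an unspecified value (0 here).
struct nullable_ints {
  std::vector<int32_t> values;
  std::vector<bool> valid;  // true = valid, false = null
};

nullable_ints add(nullable_ints const& lhs, nullable_ints const& rhs)
{
  nullable_ints out{std::vector<int32_t>(lhs.values.size(), 0),
                    std::vector<bool>(lhs.values.size(), false)};
  for (std::size_t i = 0; i < lhs.values.size(); ++i) {
    out.valid[i] = lhs.valid[i] && rhs.valid[i];  // nulls propagate through AND
    if (out.valid[i]) { out.values[i] = lhs.values[i] + rhs.values[i]; }
  }
  return out;
}

int main()
{
  // same data as the BasicAdditionNulls test above
  nullable_ints c0{{3, 20, 1, 50}, {false, false, true, true}};
  nullable_ints c1{{10, 7, 20, 0}, {false, true, false, true}};
  auto const r = add(c0, c1);
  for (std::size_t i = 0; i < r.values.size(); ++i) {
    std::cout << (r.valid[i] ? std::to_string(r.values[i]) : "null") << " ";
  }
  std::cout << "\n";  // prints: null null null 50
}
```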
struct stringify_out_values { - template ()>* = nullptr> - std::string operator()(TypeOut lhs, TypeOut rhs) const + template + std::string operator()(size_type i, TypeOut lhs, TypeOut rhs) const { std::stringstream out_str; - out_str << "lhs: " << lhs << "\nrhs: " << rhs; - return out_str.str(); - } - - template ()>* = nullptr> - std::string operator()(TypeOut lhs, TypeOut rhs) const - { - std::stringstream out_str; - out_str << "lhs: " << lhs.time_since_epoch().count() - << "\nrhs: " << rhs.time_since_epoch().count(); - return out_str.str(); - } - - template ()>* = nullptr> - std::string operator()(TypeOut lhs, TypeOut rhs) const - { - std::stringstream out_str; - out_str << "lhs: " << lhs.count() << "\nrhs: " << rhs.count(); + out_str << "[" << i << "]:\n"; + if constexpr (is_fixed_point()) { + out_str << "lhs: " << std::string(lhs) << "\nrhs: " << std::string(rhs); + } else if constexpr (is_timestamp()) { + out_str << "lhs: " << lhs.time_since_epoch().count() + << "\nrhs: " << rhs.time_since_epoch().count(); + } else if constexpr (is_duration()) { + out_str << "lhs: " << lhs.count() << "\nrhs: " << rhs.count(); + } else { + out_str << "lhs: " << lhs << "\nrhs: " << rhs; + } return out_str.str(); } }; @@ -101,7 +94,7 @@ void ASSERT_BINOP(column_view const& out, for (size_t i = 0; i < out_data.size(); ++i) { auto lhs = out_data[i]; auto rhs = (TypeOut)(op(lhs_h, rhs_data[i])); - ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(lhs, rhs); + ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(i, lhs, rhs); } if (rhs.nullable()) { @@ -148,7 +141,7 @@ void ASSERT_BINOP(column_view const& out, for (size_t i = 0; i < out_data.size(); ++i) { auto lhs = out_data[i]; auto rhs = (TypeOut)(op(lhs_data[i], rhs_h)); - ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(lhs, rhs); + ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(i, lhs, rhs); } if (lhs.nullable()) { @@ -196,7 +189,7 @@ void ASSERT_BINOP(column_view const& out, for (size_t i = 0; i < out_data.size(); ++i) { auto lhs = out_data[i]; auto rhs = (TypeOut)(op(lhs_data[i], rhs_data[i])); - ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(lhs, rhs); + ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(i, lhs, rhs); } if (lhs.nullable() and rhs.nullable()) { diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp new file mode 100644 index 00000000000..feb75cc3f09 --- /dev/null +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include "cudf/utilities/error.hpp" + +namespace cudf::test::binop { + +template +struct FixedPointCompiledTestBothReps : public cudf::test::BaseFixture { +}; + +template +using wrapper = cudf::test::fixed_width_column_wrapper; +TYPED_TEST_CASE(FixedPointCompiledTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd) +{ + using namespace numeric; + using decimalXX = TypeParam; + + auto const sz = std::size_t{1000}; + + auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto i) { + return decimalXX{i, scale_type{0}}; + }); + auto const vec1 = std::vector(begin, begin + sz); + auto const vec2 = std::vector(sz, decimalXX{2, scale_type{0}}); + auto expected = std::vector(sz); + + std::transform(std::cbegin(vec1), + std::cend(vec1), + std::cbegin(vec2), + std::begin(expected), + std::plus()); + + auto const lhs = wrapper(vec1.begin(), vec1.end()); + auto const rhs = wrapper(vec2.begin(), vec2.end()); + auto const expected_col = wrapper(expected.begin(), expected.end()); + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply) +{ + using namespace numeric; + using decimalXX = TypeParam; + + auto const sz = std::size_t{1000}; + + auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto i) { + return decimalXX{i, scale_type{0}}; + }); + auto const vec1 = std::vector(begin, begin + sz); + auto const vec2 = std::vector(sz, decimalXX{2, scale_type{0}}); + auto expected = std::vector(sz); + + std::transform(std::cbegin(vec1), + std::cend(vec1), + std::cbegin(vec2), + std::begin(expected), + std::multiplies()); + + auto const lhs = wrapper(vec1.begin(), vec1.end()); + auto const rhs = wrapper(vec2.begin(), vec2.end()); + auto const expected_col = wrapper(expected.begin(), expected.end()); + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); +} + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply2) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = fp_wrapper{{10, 10, 10, 10, 10}, scale_type{0}}; + auto const expected = fp_wrapper{{110, 220, 330, 440, 550}, scale_type{-1}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = 
device_storage_type_t; + + auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; + auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{0}}; + auto const expected = fp_wrapper{{2, 7, 12, 17}, scale_type{-1}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv2) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; + auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{-2}}; + auto const expected = fp_wrapper{{2, 7, 12, 17}, scale_type{1}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv3) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; + auto const rhs = make_fixed_point_scalar(12, scale_type{-1}); + auto const expected = fp_wrapper{{0, 2, 4, 5}, scale_type{0}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv4) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto begin = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 11; }); + auto result_begin = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i * 11) / 12; }); + auto const lhs = fp_wrapper(begin, begin + 1000, scale_type{-1}); + auto const rhs = make_fixed_point_scalar(12, scale_type{-1}); + auto const expected = fp_wrapper(result_begin, result_begin + 1000, scale_type{0}); + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd2) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = fp_wrapper{{100, 200, 300, 400, 500}, scale_type{-2}}; + auto const expected = fp_wrapper{{210, 420, 630, 840, 1050}, scale_type{-2}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + 
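The addition tests above depend on the decimal scale rule in which the result adopts the smaller (more precise) scale, e.g. scale -1 plus scale -2 yields scale -2. A host-side sketch of that rescale-then-add behavior on scaled integers (assumed semantics, not the libcudf kernel):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Host-side sketch (not the libcudf kernel) of the assumed scale rule for
// fixed-point addition: rescale both operands to the smaller (more precise)
// scale, add the unscaled values, and keep that scale.
// The represented value is unscaled * 10^scale.
struct decimal {
  int64_t unscaled;
  int32_t scale;
};

int64_t pow10(int32_t n)
{
  int64_t r = 1;
  while (n-- > 0) { r *= 10; }
  return r;
}

decimal add(decimal a, decimal b)
{
  auto const scale = std::min(a.scale, b.scale);
  auto const av    = a.unscaled * pow10(a.scale - scale);  // exponent is >= 0
  auto const bv    = b.unscaled * pow10(b.scale - scale);
  return {av + bv, scale};
}

int main()
{
  // 1.1 (11 at scale -1) + 1.00 (100 at scale -2) == 2.10 (210 at scale -2),
  // matching the FixedPointBinaryOpAdd2 expectation above
  auto const r = add(decimal{11, -1}, decimal{100, -2});
  std::cout << r.unscaled << " * 10^" << r.scale << "\n";  // 210 * 10^-2
}
```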
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd3) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{1100, 2200, 3300, 4400, 5500}, scale_type{-3}}; + auto const rhs = fp_wrapper{{100, 200, 300, 400, 500}, scale_type{-2}}; + auto const expected = fp_wrapper{{2100, 4200, 6300, 8400, 10500}, scale_type{-3}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd4) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = make_fixed_point_scalar(100, scale_type{-2}); + auto const expected = fp_wrapper{{210, 320, 430, 540, 650}, scale_type{-2}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::ADD, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd5) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = make_fixed_point_scalar(100, scale_type{-2}); + auto const rhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const expected = fp_wrapper{{210, 320, 430, 540, 650}, scale_type{-2}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::ADD, lhs->type(), static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd6) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col = fp_wrapper{{30, 4, 5, 6, 7, 8}, scale_type{0}}; + + auto const expected1 = fp_wrapper{{60, 8, 10, 12, 14, 16}, scale_type{0}}; + auto const expected2 = fp_wrapper{{6, 0, 1, 1, 1, 1}, scale_type{1}}; + auto const type1 = cudf::data_type{cudf::type_to_id(), 0}; + auto const type2 = cudf::data_type{cudf::type_to_id(), 1}; + auto const result1 = + cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type1); + auto const result2 = + cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointCast) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col = fp_wrapper{{6, 8, 10, 12, 14, 16}, scale_type{0}}; + auto const expected = fp_wrapper{{0, 0, 1, 1, 1, 1}, scale_type{1}}; + auto const type = cudf::data_type{cudf::type_to_id(), 1}; + auto const result = cudf::cast(col, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiplyScalar) +{ + 
using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = make_fixed_point_scalar(100, scale_type{-1}); + auto const expected = fp_wrapper{{1100, 2200, 3300, 4400, 5500}, scale_type{-2}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::MUL, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpSimplePlus) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{150, 200}, scale_type{-2}}; + auto const rhs = fp_wrapper{{2250, 1005}, scale_type{-3}}; + auto const expected = fp_wrapper{{3750, 3005}, scale_type{-3}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimple) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const trues = std::vector(4, true); + auto const col1 = fp_wrapper{{1, 2, 3, 4}, scale_type{0}}; + auto const col2 = fp_wrapper{{100, 200, 300, 400}, scale_type{-2}}; + auto const expected = wrapper(trues.begin(), trues.end()); + + auto const result = cudf::experimental::binary_operation( + col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const trues = std::vector(4, true); + auto const col = fp_wrapper{{1, 2, 3, 4}, scale_type{0}}; + auto const expected = wrapper(trues.begin(), trues.end()); + + auto const result = cudf::experimental::binary_operation( + col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col1 = fp_wrapper{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{0}}; + auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; + auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; + + auto const result = cudf::experimental::binary_operation( + col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col1 = fp_wrapper{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}}; + auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; + auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; + + auto const result = cudf::experimental::binary_operation( + 
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}};
+  auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
+  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
+
+  auto const result = cudf::experimental::binary_operation(
+    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualLessGreater)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const sz = std::size_t{1000};
+
+  // TESTING binary op ADD
+
+  auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto e) { return e * 1000; });
+  auto const vec1 = std::vector<RepType>(begin, begin + sz);
+  auto const vec2 = std::vector<RepType>(sz, 0);
+
+  auto const iota_3  = fp_wrapper<RepType>(vec1.begin(), vec1.end(), scale_type{-3});
+  auto const zeros_3 = fp_wrapper<RepType>(vec2.begin(), vec2.end(), scale_type{-1});
+
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
+                                                   static_cast<cudf::column_view>(iota_3).type(),
+                                                   static_cast<cudf::column_view>(zeros_3).type());
+  auto const iota_3_after_add =
+    cudf::experimental::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view());
+
+  // TESTING binary op EQUAL, LESS, GREATER
+
+  auto const trues    = std::vector<bool>(sz, true);
+  auto const true_col = wrapper<bool>(trues.begin(), trues.end());
+
+  auto const btype        = cudf::data_type{type_id::BOOL8};
+  auto const equal_result = cudf::experimental::binary_operation(
+    iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view());
+
+  auto const less_result = cudf::experimental::binary_operation(
+    zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view());
+
+  auto const greater_result = cudf::experimental::binary_operation(
+    iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMaxSimple)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const trues    = std::vector<bool>(4, true);
+  auto const col1     = fp_wrapper<RepType>{{40, 30, 20, 10, 0}, {1, 0, 1, 1, 0}, scale_type{-2}};
+  auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 1, 1, 0, 0}, scale_type{-2}};
+  auto const expected = fp_wrapper<RepType>{{40, 20, 30, 10, 0}, {1, 1, 1, 1, 0}, scale_type{-2}};
+
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX,
+                                                   static_cast<cudf::column_view>(col1).type(),
+                                                   static_cast<cudf::column_view>(col2).type());
+  auto const result =
+    cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMinSimple)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const trues    = std::vector<bool>(4, true);
+  auto const col1     = fp_wrapper<RepType>{{40, 30, 20, 10, 0}, {1, 1, 1, 0, 0}, scale_type{-1}};
+  auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 0, 1, 1, 0}, scale_type{-1}};
+  auto const expected = fp_wrapper<RepType>{{10, 30, 20, 40, 0}, {1, 1, 1, 1, 0}, scale_type{-1}};
+
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN,
+                                                   static_cast<cudf::column_view>(col1).type(),
+                                                   static_cast<cudf::column_view>(col2).type());
+  auto const result =
+    cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
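+// By contrast with EQUAL, the NULL_MAX/NULL_MIN tests above expect null-aware
+// semantics: a lone null operand is ignored in favor of the non-null value, and
+// the result is null only when both inputs are null (the final row of each
+// `expected`). NULL_EQUALS, tested next, never yields nulls and compares two
+// nulls as equal.
+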
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullEqualsSimple)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const trues    = std::vector<bool>(4, true);
+  auto const col1     = fp_wrapper<RepType>{{400, 300, 300, 100}, {1, 1, 1, 0}, scale_type{-2}};
+  auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
+  auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
+
+  auto const result = cudf::experimental::binary_operation(
+    col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
+  auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{0}};
+  auto const expected = fp_wrapper<RepType>{{25, 75, 125, 175}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div2)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{100000, 300000, 500000, 700000}, scale_type{-3}};
+  auto const rhs      = fp_wrapper<RepType>{{20, 20, 20, 20}, scale_type{-1}};
+  auto const expected = fp_wrapper<RepType>{{5000, 15000, 25000, 35000}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div3)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{10000, 30000, 50000, 70000}, scale_type{-2}};
+  auto const rhs      = fp_wrapper<RepType>{{3, 9, 3, 3}, scale_type{0}};
+  auto const expected = fp_wrapper<RepType>{{3333, 3333, 16666, 23333}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div4)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{1}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(3, scale_type{0});
+  auto const expected = fp_wrapper<RepType>{{3, 10, 16, 23}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
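+// A worked example for Div6 below: lhs is 3000 * 10^-3 = 3.000 and rhs is
+// {1.0, 3.0, 5.0, 7.0}. Fixed-point division yields scale
+// lhs_scale - rhs_scale = -3 - (-1) = -2, so e.g. 3 / 7 = 0.428... truncates to
+// rep 42 at scale -2, matching the requested output type.
+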
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div6)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs = make_fixed_point_scalar<decimalXX>(3000, scale_type{-3});
+  auto const rhs = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
+
+  auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div7)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs = make_fixed_point_scalar<decimalXX>(1200, scale_type{0});
+  auto const rhs = fp_wrapper<RepType>{{100, 200, 300, 500, 600, 800, 1200, 1300}, scale_type{-2}};
+
+  auto const expected = fp_wrapper<RepType>{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 2};
+  auto const result =
+    cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div8)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{4000, 6000, 80000}, scale_type{-1}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(5000, scale_type{-3});
+  auto const expected = fp_wrapper<RepType>{{0, 1, 16}, scale_type{2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div9)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{10, 20, 30}, scale_type{2}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{1});
+  auto const expected = fp_wrapper<RepType>{{1, 2, 4}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div10)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{100, 200, 300}, scale_type{1}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{0});
+  auto const expected = fp_wrapper<RepType>{{14, 28, 42}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div11)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{1000, 2000, 3000}, scale_type{1}};
+  auto const rhs      = fp_wrapper<RepType>{{7, 7, 7}, scale_type{0}};
+  auto const expected = fp_wrapper<RepType>{{142, 285, 428}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
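+// The throw test below exercises validation rather than arithmetic: comparison
+// operators such as LESS must produce a BOOL8 output column, so requesting a
+// decimal output type should raise cudf::logic_error.
+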
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpThrows)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
+  auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
+  auto const float_type    = data_type{type_id::FLOAT32};
+  EXPECT_THROW(
+    cudf::experimental::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
+    cudf::logic_error);
+  // Allowed in the compiled path now, but still disallowed in the jit path:
+  // EXPECT_THROW(cudf::experimental::binary_operation(col, col, cudf::binary_operator::MUL,
+  //                                                   float_type),
+  //              cudf::logic_error);
+}
+
+}  // namespace cudf::test::binop
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
new file mode 100644
index 00000000000..081ae41fef1
--- /dev/null
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_list_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+#include <tests/binaryop/assert-binops.h>
+#include <tests/binaryop/binop-fixture.hpp>
+
+#include <cudf/binaryop.hpp>
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/types.hpp>
+
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+#include "cudf/utilities/error.hpp"
+
+#include <type_traits>
+
+namespace cudf::test::binop {
+
+template <typename TypeLhs>
+auto lhs_random_column(size_type size)
+{
+  return BinaryOperationTest::make_random_wrapped_column<TypeLhs>(size);
+}
+
+template <>
+auto lhs_random_column<std::string>(size_type size)
+{
+  return cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
+                                            {1, 1, 0, 1, 1, 1, 1});
+}
+template <typename TypeRhs>
+auto rhs_random_column(size_type size)
+{
+  return BinaryOperationTest::make_random_wrapped_column<TypeRhs>(size);
+}
+template <>
+auto rhs_random_column<std::string>(size_type size)
+{
+  return cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"},
+                                            {1, 1, 1, 1, 0, 1, 1});
+}
+
+// combinations to test
+//     n      t     d
+// n   n.n    n.t   n.d
+// t   t.n    t.t   t.d
+// d   d.n    d.t   d.d
+
+constexpr size_type col_size = 10000;
+template <typename T>
+struct BinaryOperationCompiledTest : public BinaryOperationTest {
+  using TypeOut = cudf::test::GetType<T, 0>;
+  using TypeLhs = cudf::test::GetType<T, 1>;
+  using TypeRhs = cudf::test::GetType<T, 2>;
+
+  template