diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 3366554db30..26d07515f70 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
       skip_upload_pkgs: libcudf-example
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -67,7 +67,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -77,7 +77,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index cf20b0006a2..f33fc15c52f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,32 +25,32 @@ jobs:
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04
   checks:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04
     with:
       build_type: pull-request
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
     with:
       build_type: pull-request
       test_script: "ci/test_python_cudf.sh"
@@ -58,14 +58,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
     with:
       build_type: pull-request
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
     with:
       build_type: pull-request
       node_type: "gpu-latest-1"
@@ -75,7 +75,7 @@ jobs:
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
     with:
       build_type: pull-request
       node_type: "gpu-latest-1"
@@ -85,7 +85,7 @@ jobs:
   wheel-build-cudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
     with:
       build_type: pull-request
       package-name: cudf
@@ -94,7 +94,7 @@ jobs:
   wheel-tests-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
     with:
       build_type: pull-request
       package-name: cudf
@@ -106,7 +106,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-tests-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.04
     with:
       build_type: pull-request
       package-name: dask_cudf
@@ -115,7 +115,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.04
     with:
       build_type: pull-request
       package-name: dask_cudf
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1b117bb2f4f..ff19d51f8ef 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -43,7 +43,7 @@ jobs:
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -67,7 +67,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -78,7 +78,7 @@ jobs:
       test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests"
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp
index 1844e93bc53..fe015b27f13 100644
--- a/cpp/benchmarks/string/repeat_strings.cpp
+++ b/cpp/benchmarks/string/repeat_strings.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -79,42 +79,6 @@ static void BM_repeat_strings_column_times(benchmark::State& state)
                           (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t)));
 }
 
-static void BM_compute_output_strings_sizes(benchmark::State& state)
-{
-  auto const n_rows           = static_cast<cudf::size_type>(state.range(0));
-  auto const max_str_length   = static_cast<cudf::size_type>(state.range(1));
-  auto const table            = create_data_table(2, n_rows, max_str_length);
-  auto const strings_col      = cudf::strings_column_view(table->view().column(0));
-  auto const repeat_times_col = table->view().column(1);
-
-  for ([[maybe_unused]] auto _ : state) {
-    [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col);
-  }
-
-  state.SetBytesProcessed(state.iterations() *
-                          (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t)));
-}
-
-static void BM_repeat_strings_column_times_precomputed_sizes(benchmark::State& state)
-{
-  auto const n_rows           = static_cast<cudf::size_type>(state.range(0));
-  auto const max_str_length   = static_cast<cudf::size_type>(state.range(1));
-  auto const table            = create_data_table(2, n_rows, max_str_length);
-  auto const strings_col      = cudf::strings_column_view(table->view().column(0));
-  auto const repeat_times_col = table->view().column(1);
-  [[maybe_unused]] auto const [sizes, total_bytes] =
-    cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col);
-
-  for ([[maybe_unused]] auto _ : state) {
-    [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::repeat_strings(strings_col, repeat_times_col, *sizes);
-  }
-
-  state.SetBytesProcessed(state.iterations() *
-                          (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t)));
-}
-
 static void generate_bench_args(benchmark::internal::Benchmark* b)
 {
   int const min_rows   = 1 << 8;
@@ -145,23 +109,5 @@ class RepeatStrings : public cudf::benchmark {
     ->UseManualTime()                                               \
     ->Unit(benchmark::kMillisecond);
 
-#define COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(RepeatStrings, name)                            \
-  (::benchmark::State & st) { BM_compute_output_strings_sizes(st); } \
-  BENCHMARK_REGISTER_F(RepeatStrings, name)                          \
-    ->Apply(generate_bench_args)                                     \
-    ->UseManualTime()                                                \
-    ->Unit(benchmark::kMillisecond);
-
-#define REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(RepeatStrings, name)                                             \
-  (::benchmark::State & st) { BM_repeat_strings_column_times_precomputed_sizes(st); } \
-  BENCHMARK_REGISTER_F(RepeatStrings, name)                                           \
-    ->Apply(generate_bench_args)                                                      \
-    ->UseManualTime()                                                                 \
-    ->Unit(benchmark::kMillisecond);
-
 REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(scalar_times)
 REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(column_times)
-COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(compute_output_strings_sizes)
-REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(precomputed_sizes)
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index 0e6ee2126d3..26fe5f95983 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,15 +32,15 @@ namespace strings {
  */
 
 /**
- * @brief Repeat the given string scalar by a given number of times.
+ * @brief Repeat the given string scalar a given number of times
  *
  * An output string scalar is generated by repeating the input string by a number of times given by
- * the @p `repeat_times` parameter.
+ * the `repeat_times` parameter.
  *
  * In special cases:
- *  - If @p `repeat_times` is not a positive value, an empty (valid) string scalar will be returned.
+ *  - If `repeat_times` is not a positive value, an empty (valid) string scalar will be returned.
  *  - An invalid input scalar will always result in an invalid output scalar regardless of the
- *    value of @p `repeat_times` parameter.
+ *    value of `repeat_times` parameter.
  *
  * @code{.pseudo}
  * Example:
@@ -50,13 +50,13 @@ namespace strings {
  * @endcode
  *
  * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that
- *        can be stored by the index type
- *        (i.e., @code input.size() * repeat_times > numeric_limits<size_type>::max() @endcode).
+ *        can be stored by the index type:
+ *        `input.size() * repeat_times > max of size_type`
  *
- * @param input The scalar containing the string to repeat.
- * @param repeat_times The number of times the input string is repeated.
- * @param mr Device memory resource used to allocate the returned string scalar.
- * @return New string scalar in which the input string is repeated.
+ * @param input The scalar containing the string to repeat
+ * @param repeat_times The number of times the input string is repeated
+ * @param mr Device memory resource used to allocate the returned string scalar
+ * @return New string scalar in which the input string is repeated
  */
 std::unique_ptr<string_scalar> repeat_string(
   string_scalar const& input,
@@ -64,19 +64,16 @@ std::unique_ptr<string_scalar> repeat_string(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Repeat each string in the given strings column by a given number of times.
+ * @brief Repeat each string in the given strings column a given number of times
  *
- * An output strings column is generated by repeating each string from the input strings column by a
- * number of times given by the @p `repeat_times` parameter.
+ * An output strings column is generated by repeating each string from the input strings column by
+ * the number of times given by the `repeat_times` parameter.
  *
  * In special cases:
- *  - If @p `repeat_times` is not a positive number, a non-null input string will always result in
+ *  - If `repeat_times` is not a positive number, a non-null input string will always result in
  *    an empty output string.
  *  - A null input string will always result in a null output string regardless of the value of the
- *    @p `repeat_times` parameter.
- *
- * The caller is responsible for checking the output column size will not exceed the maximum size of
- * a strings column (number of total characters is less than the max size_type value).
+ *    `repeat_times` parameter.
  *
  * @code{.pseudo}
  * Example:
@@ -85,10 +82,10 @@ std::unique_ptr<string_scalar> repeat_string(
  * out is ['aaaaaa', null, '', 'bbcbbcbbc']
  * @endcode
  *
- * @param input The column containing strings to repeat.
- * @param repeat_times The number of times each input string is repeated.
- * @param mr Device memory resource used to allocate the returned strings column.
- * @return New column containing the repeated strings.
+ * @param input The column containing strings to repeat
+ * @param repeat_times The number of times each input string is repeated
+ * @param mr Device memory resource used to allocate the returned strings column
+ * @return New column containing the repeated strings
  */
 std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
@@ -97,11 +94,10 @@ std::unique_ptr<column> repeat_strings(
 
 /**
  * @brief Repeat each string in the given strings column by the numbers of times given in another
- * numeric column.
+ * numeric column
  *
  * An output strings column is generated by repeating each of the input string by a number of times
- * given by the corresponding row in a @p `repeat_times` numeric column. The computational time can
- * be reduced if sizes of the output strings are known and provided.
+ * given by the corresponding row in a `repeat_times` numeric column.
  *
  * In special cases:
  *  - Any null row (from either the input strings column or the `repeat_times` column) will always
@@ -109,9 +105,6 @@ std::unique_ptr<column> repeat_strings(
  *  - If any value in the `repeat_times` column is not a positive number and its corresponding input
  *    string is not null, the output string will be an empty string.
  *
- * The caller is responsible for checking the output column size will not exceed the maximum size of
- * a strings column (number of total characters is less than the max size_type value).
- *
  * @code{.pseudo}
  * Example:
  * strs         = ['aa', null, '', 'bbc-']
@@ -120,51 +113,16 @@ std::unique_ptr<column> repeat_strings(
  * out is ['aa', null, '', 'bbc-bbc-bbc-bbc-']
  * @endcode
  *
- * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer.
+ * @throw cudf::logic_error if the input `repeat_times` is not an integer type
  * @throw cudf::logic_error if the input columns have different sizes.
  *
- * @param input The column containing strings to repeat.
+ * @param input The column containing strings to repeat
  * @param repeat_times The column containing numbers of times that the corresponding input strings
- *        are repeated.
- * @param output_strings_sizes The optional column containing pre-computed sizes of the output
- *        strings.
- * @param mr Device memory resource used to allocate the returned strings column.
+ *                     are repeated
+ * @param mr Device memory resource used to allocate the returned strings column
  * @return New column containing the repeated strings.
  */
 std::unique_ptr<column> repeat_strings(
-  strings_column_view const& input,
-  column_view const& repeat_times,
-  std::optional<column_view> output_strings_sizes = std::nullopt,
-  rmm::mr::device_memory_resource* mr             = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Compute sizes of the output strings if each string in the input strings column
- * is repeated by the numbers of times given in another numeric column.
- *
- * The output column storing string output sizes is not nullable. These string sizes are
- * also summed up and returned (in an `int64_t` value), which can be used to detect if the input
- * strings column can be safely repeated without data corruption due to overflow in string indexing.
- *
- * @code{.pseudo}
- * Example:
- * strs         = ['aa', null, '', 'bbc-']
- * repeat_times = [ 1,   2,     3,  4   ]
- * [output_sizes, total_size] = repeat_strings_output_sizes(strs, repeat_times)
- * out is [2, 0, 0, 16], and total_size = 18
- * @endcode
- *
- * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer.
- * @throw cudf::logic_error if the input columns have different sizes.
- *
- * @param input The column containing strings to repeat.
- * @param repeat_times The column containing numbers of times that the corresponding input strings
- *        are repeated.
- * @param mr Device memory resource used to allocate the returned strings column.
- * @return A pair with the first item is an int32_t column containing sizes of the output strings,
- *         and the second item is an int64_t number containing the total sizes (in bytes) of the
- *         output strings column.
- */
-std::pair<std::unique_ptr<column>, int64_t> repeat_strings_output_sizes(
   strings_column_view const& input,
   column_view const& repeat_times,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 23d130e1585..ee115e7432a 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -104,20 +104,41 @@ struct page_state_s {
  * specified row bounds
  *
  * @param s The page to be checked
- * @param min_row The starting row index
+ * @param start_row The starting row index
  * @param num_rows The number of rows
  *
  * @return True if the page spans the beginning or the end of the row bounds
  */
-inline __device__ bool is_bounds_page(page_state_s* const s, size_t min_row, size_t num_rows)
+inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows)
 {
   size_t const page_begin = s->col.start_row + s->page.chunk_row;
   size_t const page_end   = page_begin + s->page.num_rows;
-  size_t const begin      = min_row;
-  size_t const end        = min_row + num_rows;
+  size_t const begin      = start_row;
+  size_t const end        = start_row + num_rows;
+
   return ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end));
 }
 
+/**
+ * @brief Returns whether or not a page is completely contained within the specified
+ * row bounds, i.e. it neither starts before `start_row` nor ends after `start_row + num_rows`
+ *
+ * @param s The page to be checked; its rows begin at `s->col.start_row + s->page.chunk_row`
+ * @param start_row The starting row index of the bounds
+ * @param num_rows The number of rows covered by the bounds
+ *
+ * @return True if the page is completely contained within the row bounds
+ */
+inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows)
+{
+  size_t const page_begin = s->col.start_row + s->page.chunk_row;
+  size_t const page_end   = page_begin + s->page.num_rows;
+  size_t const begin      = start_row;
+  size_t const end        = start_row + num_rows;
+
+  return page_begin >= begin && page_end <= end;  // touching either bound still counts
+}
+
 /**
  * @brief Read a 32-bit varint integer
  *
@@ -1728,10 +1749,11 @@ __global__ void __launch_bounds__(block_size)
       auto const thread_depth = depth + t;
       if (thread_depth < s->page.num_output_nesting_levels) {
         // if we are not a bounding page (as checked above) then we are either
-        // returning 0 rows from the page (completely outside the bounds) or all
-        // rows in the page (completely within the bounds)
+        // returning all rows/values from this page, or 0 of them
         pp->nesting[thread_depth].batch_size =
-          s->num_rows == 0 ? 0 : pp->nesting[thread_depth].size;
+          (s->num_rows == 0 && !is_page_contained(s, min_row, num_rows))
+            ? 0
+            : pp->nesting[thread_depth].size;
       }
       depth += blockDim.x;
     }
@@ -1838,7 +1860,19 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData(
   bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0;
 
   // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  if (s->num_rows == 0 && !(has_repetition && is_bounds_page(s, min_row, num_rows))) { return; }
+  //
+  // corner case: in the case of lists, we can have pages that contain "0" rows if the current row
+  // starts before this page and ends after this page:
+  //       P0        P1        P2
+  //  |---------|---------|----------|
+  //        ^------------------^
+  //      row start           row end
+  // P1 will contain 0 rows
+  //
+  if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) ||
+                                               is_page_contained(s, min_row, num_rows)))) {
+    return;
+  }
 
   if (s->dict_base) {
     out_thread0 = (s->dict_bits > 0) ? 64 : 32;
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index cc283fbcee2..3784b535a5b 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -176,7 +176,7 @@ namespace {
  * separate number of times.
  */
 template <class Iterator>
-struct compute_size_and_repeat_separately_fn {
+struct compute_sizes_and_repeat_fn {
   column_device_view const strings_dv;
   column_device_view const repeat_times_dv;
   Iterator const repeat_times_iter;
@@ -189,146 +189,63 @@ struct compute_size_and_repeat_separately_fn {
   // If d_chars != nullptr: only repeat strings.
   char* d_chars{nullptr};
 
-  __device__ int64_t operator()(size_type const idx) const noexcept
+  __device__ void operator()(size_type const idx) const noexcept
   {
     auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx);
     auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx);
 
     // Any null input (either string or repeat_times value) will result in a null output.
     auto const is_valid = string_is_valid && rtimes_is_valid;
+    if (!is_valid) {
+      if (!d_chars) { d_offsets[idx] = 0; }  // null row: record size 0 when d_chars is unset
+      return;
+    }
 
-    // When the input string is null, `repeat_times` and `string_size` are also set to 0.
-    // This makes sure that if `repeat_times > 0` then we will always have a valid input string,
-    // and if `repeat_times <= 0` we will never copy anything to the output.
-    auto const repeat_times = is_valid ? repeat_times_iter[idx] : size_type{0};
-    auto const string_size =
-      is_valid ? strings_dv.element<string_view>(idx).size_bytes() : size_type{0};
-
-    // The output_size is returned, and it needs to be an int64_t number to prevent overflow.
-    auto const output_size =
-      repeat_times > 0 ? static_cast<int64_t>(repeat_times) * static_cast<int64_t>(string_size)
-                       : int64_t{0};
+    auto repeat_times = repeat_times_iter[idx];
+    auto const d_str  = strings_dv.element<string_view>(idx);
 
     if (!d_chars) {
-      // If overflow happen, the stored value of output string size will be incorrect due to
-      // downcasting. In such cases, the entire output string size array should be discarded.
-      d_offsets[idx] = static_cast<offset_type>(output_size);
-    } else if (repeat_times > 0 && string_size > 0) {
-      auto const d_str     = strings_dv.element<string_view>(idx);
-      auto const input_ptr = d_str.data();
-      auto output_ptr      = d_chars + d_offsets[idx];
-      for (size_type repeat_idx = 0; repeat_idx < repeat_times; ++repeat_idx) {
-        output_ptr = copy_and_increment(output_ptr, input_ptr, string_size);
+      // repeat_times could be negative; NOTE(review): no int64_t widening here -- may overflow
+      d_offsets[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0;
+    } else {
+      auto output_ptr = d_chars + d_offsets[idx];
+      while (repeat_times-- > 0) {  // loop body is skipped when repeat_times <= 0
+        output_ptr = copy_and_increment(output_ptr, d_str.data(), d_str.size_bytes());
       }
     }
-
-    // The output_size value may be used to sum up to detect overflow at the caller site.
-    // The caller can detect overflow easily by checking `SUM(output_size) > INT_MAX`.
-    return output_size;
   }
 };
 
-/**
- * @brief Creates child offsets and chars columns by applying the template function that
- * can be used for computing the output size of each string as well as create the output.
- *
- * This function is similar to `strings::detail::make_strings_children`, except that it accepts an
- * optional input `std::optional<column_view>` that can contain the precomputed sizes of the output
- * strings.
- *
- * @deprecated This will be removed with issue 12542
- */
-template <typename Func>
-auto make_strings_children(Func fn,
-                           size_type exec_size,
-                           size_type strings_count,
-                           std::optional<column_view> output_strings_sizes,
-                           rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
-{
-  auto offsets_column = make_numeric_column(
-    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-
-  auto offsets_view = offsets_column->mutable_view();
-  auto d_offsets    = offsets_view.template data<size_type>();
-  fn.d_offsets      = d_offsets;
-
-  // This may be called twice -- once for offsets and once for chars.
-  auto for_each_fn = [exec_size, stream](Func& fn) {
-    thrust::for_each_n(
-      rmm::exec_policy(stream), thrust::make_counting_iterator<size_type>(0), exec_size, fn);
-  };
-
-  if (!output_strings_sizes.has_value()) {
-    // Compute the output sizes only if they are not given.
-    for_each_fn(fn);
-
-    // Compute the offsets values.
-    auto const bytes =
-      cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
-    CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-                 "Size of output exceeds column size limit");
-  } else {
-    // Compute the offsets values from the provided output string sizes.
-    auto const string_sizes = output_strings_sizes.value();
-    CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value()));
-    thrust::inclusive_scan(rmm::exec_policy(stream),
-                           string_sizes.template begin<size_type>(),
-                           string_sizes.template end<size_type>(),
-                           d_offsets + 1);
-  }
-
-  // Now build the chars column
-  auto const bytes  = cudf::detail::get_value<size_type>(offsets_view, strings_count, stream);
-  auto chars_column = create_chars_child_column(bytes, stream, mr);
-
-  // Execute the function fn again to fill the chars column.
-  // Note that if the output chars column has zero size, the function fn should not be called to
-  // avoid accidentally overwriting the offsets.
-  if (bytes > 0) {
-    fn.d_chars = chars_column->mutable_view().template data<char>();
-    for_each_fn(fn);
-  }
-
-  return std::pair(std::move(offsets_column), std::move(chars_column));
-}
-
 }  // namespace
 
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        column_view const& repeat_times,
-                                       std::optional<column_view> output_strings_sizes,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size.");
   CUDF_EXPECTS(cudf::is_index_type(repeat_times.type()),
                "repeat_strings expects an integer type for the `repeat_times` input column.");
-  if (output_strings_sizes.has_value()) {
-    auto const output_sizes = output_strings_sizes.value();
-    CUDF_EXPECTS(input.size() == output_sizes.size() &&
-                   (!output_sizes.nullable() || !output_sizes.has_nulls()),
-                 "The given column of output string sizes is invalid.");
-  }
 
   auto const strings_count = input.size();
   if (strings_count == 0) { return make_empty_column(type_id::STRING); }
 
   auto const strings_dv_ptr      = column_device_view::create(input.parent(), stream);
   auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream);
-  auto const strings_has_nulls   = input.has_nulls();
-  auto const rtimes_has_nulls    = repeat_times.has_nulls();
   auto const repeat_times_iter =
     cudf::detail::indexalator_factory::make_input_iterator(repeat_times);
-  auto const fn = compute_size_and_repeat_separately_fn<decltype(repeat_times_iter)>{
-    *strings_dv_ptr, *repeat_times_dv_ptr, repeat_times_iter, strings_has_nulls, rtimes_has_nulls};
-
-  auto [offsets_column, chars_column] =
-    make_strings_children(fn, strings_count, strings_count, output_strings_sizes, stream, mr);
-
-  // We generate new bitmask by AND of the input columns' bitmasks.
-  // Note that if the input columns are nullable, the output column will also be nullable (which may
-  // not have nulls).
+  auto const fn =
+    compute_sizes_and_repeat_fn<decltype(repeat_times_iter)>{*strings_dv_ptr,
+                                                             *repeat_times_dv_ptr,
+                                                             repeat_times_iter,
+                                                             input.has_nulls(),
+                                                             repeat_times.has_nulls()};
+
+  auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr);
+
+  // We generate new bitmask by AND of the two input columns' bitmasks.
+  // Note that if either of the input columns is nullable, the output column will also be nullable
+  // but may not have nulls.
   auto [null_mask, null_count] =
     cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr);
 
@@ -338,52 +255,6 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                              null_count,
                              std::move(null_mask));
 }
-
-std::pair<std::unique_ptr<column>, int64_t> repeat_strings_output_sizes(
-  strings_column_view const& input,
-  column_view const& repeat_times,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size.");
-  CUDF_EXPECTS(
-    cudf::is_index_type(repeat_times.type()),
-    "repeat_strings_output_sizes expects an integer type for the `repeat_times` input column.");
-
-  auto const strings_count = input.size();
-  if (strings_count == 0) {
-    return std::pair(make_empty_column(type_to_id<size_type>()), int64_t{0});
-  }
-
-  auto output_sizes = make_numeric_column(
-    data_type{type_to_id<size_type>()}, strings_count, mask_state::UNALLOCATED, stream, mr);
-
-  auto const strings_dv_ptr      = column_device_view::create(input.parent(), stream);
-  auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream);
-  auto const strings_has_nulls   = input.has_nulls();
-  auto const rtimes_has_nulls    = repeat_times.has_nulls();
-  auto const repeat_times_iter =
-    cudf::detail::indexalator_factory::make_input_iterator(repeat_times);
-
-  auto const fn = compute_size_and_repeat_separately_fn<decltype(repeat_times_iter)>{
-    *strings_dv_ptr,
-    *repeat_times_dv_ptr,
-    repeat_times_iter,
-    strings_has_nulls,
-    rtimes_has_nulls,
-    output_sizes->mutable_view().template begin<size_type>()};
-
-  auto const total_bytes =
-    thrust::transform_reduce(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator<size_type>(0),
-                             thrust::make_counting_iterator<size_type>(strings_count),
-                             fn,
-                             int64_t{0},
-                             thrust::plus{});
-
-  return std::pair(std::move(output_sizes), total_bytes);
-}
-
 }  // namespace detail
 
 std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
@@ -404,21 +275,10 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
 
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        column_view const& repeat_times,
-                                       std::optional<column_view> output_strings_sizes,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat_strings(
-    input, repeat_times, output_strings_sizes, cudf::get_default_stream(), mr);
-}
-
-std::pair<std::unique_ptr<column>, int64_t> repeat_strings_output_sizes(
-  strings_column_view const& input,
-  column_view const& repeat_times,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::repeat_strings_output_sizes(input, repeat_times, cudf::get_default_stream(), mr);
+  return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index 69d0494c253..e75409d9f39 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -207,20 +207,6 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)
     EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error);
   }
 
-  // Sizes mismatched between strings column and output_strings_sizes column.
-  {
-    auto const repeat_times = int32s_col{1, 2};
-    auto const sizes        = int32s_col{1, 2, 3, 4, 5};
-    EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error);
-  }
-
-  // output_strings_sizes column has nulls.
-  {
-    auto const repeat_times = int32s_col{1, 2};
-    auto const sizes        = int32s_col{{null, 2}, null_at(0)};
-    EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error);
-  }
-
   // Invalid data type for repeat_times column.
   {
     auto const repeat_times = cudf::test::fixed_width_column_wrapper<float>{1, 2, 3, 4, 5, 6};
@@ -243,11 +229,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
   auto const repeat_times =
     int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max};
 
-  auto const [sizes, total_bytes] =
-    cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times);
-  (void)sizes;
-  auto const expected_bytes = static_cast<int64_t>(half_max) * int64_t{1 + 2 + 3 + 4 + 5 + 6 + 7};
-  EXPECT_EQ(expected_bytes, total_bytes);
+  EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error);
 }
 
 TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes)
@@ -301,15 +283,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithColumnRepeatTimes)
 
     auto results = cudf::strings::repeat_strings(strs_cv, repeat_times);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{6, 12, 27, 0, 0};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(45, total_bytes);
-
-    results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 
   // repeat_times column has nulls.
@@ -320,15 +293,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithColumnRepeatTimes)
 
     auto results = cudf::strings::repeat_strings(strs_cv, repeat_times);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{6, 0, 27, 12, 0};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(45, total_bytes);
-
-    results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 }
 
@@ -377,15 +341,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime
 
     auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{6, 12, 27};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(45, total_bytes);
-
-    results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 
   // Sliced the middle of the column.
@@ -397,15 +352,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime
 
     auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{12, 27};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(39, total_bytes);
-
-    results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 
   // Sliced the second half of the column.
@@ -417,15 +363,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime
 
     auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{27, 12, 12};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(51, total_bytes);
-
-    results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 }
 
@@ -520,15 +457,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithColumnRepeatTimes)
 
     auto results = cudf::strings::repeat_strings(strs_cv, repeat_times);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{6, 0, 18, 0, 0, 0, 12, 12, 0, 0};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(48, total_bytes);
-
-    results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 
   // repeat_times column has nulls.
@@ -549,15 +477,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithColumnRepeatTimes)
 
     auto results = cudf::strings::repeat_strings(strs_cv, repeat_times);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{6, 0, 0, 0, 0, 0, 12, 0, 0, 0};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(18, total_bytes);
-
-    results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 }
 
@@ -631,15 +550,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT
 
     auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{6, 0, 0};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(6, total_bytes);
-
-    results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 
   // Sliced the middle of the column.
@@ -652,15 +562,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT
 
     auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{0, 0, 0, 0, 12};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(12, total_bytes);
-
-    results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 
   // Sliced the second half of the column, output has nulls.
@@ -672,15 +573,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT
 
     auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{12, 0, 0, 0};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(12, total_bytes);
-
-    results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity);
   }
 
   // Sliced the second half of the column, output does not have null.
@@ -693,14 +585,5 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT
 
     auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes);
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity);
-
-    auto const expected_sizes = int32s_col{0, 0};
-    auto const [sizes, total_bytes] =
-      cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity);
-    EXPECT_EQ(0, total_bytes);
-
-    results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes);
-    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity);
   }
 }
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 2d0bf28225f..0cb9ed37d9f 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2922,8 +2922,21 @@ public final ColumnVector stringReplace(Scalar target, Scalar replace) {
    * @param repl The string scalar to replace for each pattern match.
    * @return A new column vector containing the string results.
    */
+  @Deprecated
   public final ColumnVector replaceRegex(String pattern, Scalar repl) {
-    return replaceRegex(pattern, repl, -1);
+    return replaceRegex(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), repl);
+  }
+
+  /**
+   * For each string, replaces any character sequence matching the given regex program pattern
+   * using the replacement string scalar.
+   *
+   * @param regexProg The regex program with pattern to search within each string.
+   * @param repl The string scalar to replace for each pattern match.
+   * @return A new column vector containing the string results.
+   */
+  public final ColumnVector replaceRegex(RegexProgram regexProg, Scalar repl) {
+    return replaceRegex(regexProg, repl, -1);
   }
 
   /**
@@ -2935,12 +2948,27 @@ public final ColumnVector replaceRegex(String pattern, Scalar repl) {
    * @param maxRepl The maximum number of times a replacement should occur within each string.
    * @return A new column vector containing the string results.
    */
+  @Deprecated
   public final ColumnVector replaceRegex(String pattern, Scalar repl, int maxRepl) {
+    return replaceRegex(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), repl, maxRepl);
+  }
+
+  /**
+   * For each string, replaces any character sequence matching the given regex program pattern
+   * using the replacement string scalar.
+   *
+   * @param regexProg The regex program with pattern to search within each string.
+   * @param repl The string scalar to replace for each pattern match.
+   * @param maxRepl The maximum number of times a replacement should occur within each string.
+   * @return A new column vector containing the string results.
+   */
+  public final ColumnVector replaceRegex(RegexProgram regexProg, Scalar repl, int maxRepl) {
     if (!repl.getType().equals(DType.STRING)) {
       throw new IllegalArgumentException("Replacement must be a string scalar");
     }
-    return new ColumnVector(replaceRegex(getNativeView(), pattern, repl.getScalarHandle(),
-        maxRepl));
+    assert regexProg != null : "regex program may not be null";
+    return new ColumnVector(replaceRegex(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(),
+                                         regexProg.capture().nativeId, repl.getScalarHandle(), maxRepl));
   }
 
   /**
@@ -2966,9 +2994,26 @@ public final ColumnVector replaceMultiRegex(String[] patterns, ColumnView repls)
    * @param replace The replacement template for creating the output string.
    * @return A new java column vector containing the string results.
    */
+  @Deprecated
   public final ColumnVector stringReplaceWithBackrefs(String pattern, String replace) {
-    return new ColumnVector(stringReplaceWithBackrefs(getNativeView(), pattern,
-        replace));
+    return stringReplaceWithBackrefs(new RegexProgram(pattern), replace);
+  }
+
+  /**
+   * For each string, replaces any character sequence matching the given regex program
+   * pattern using the replace template for back-references.
+   *
+   * Any null string entries return corresponding null output column entries.
+   *
+   * @param regexProg The regex program with pattern to search within each string.
+   * @param replace The replacement template for creating the output string.
+   * @return A new java column vector containing the string results.
+   */
+  public final ColumnVector stringReplaceWithBackrefs(RegexProgram regexProg, String replace) {
+    assert regexProg != null : "regex program may not be null";
+    return new ColumnVector(
+        stringReplaceWithBackrefs(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(),
+                                  regexProg.capture().nativeId, replace));
   }
 
   /**
@@ -4129,12 +4174,14 @@ private static native long substringColumn(long columnView, long startColumn, lo
    * Native method for replacing each regular expression pattern match with the specified
    * replacement string.
    * @param columnView native handle of the cudf::column_view being operated on.
-   * @param pattern The regular expression pattern to search within each string.
+   * @param pattern regular expression pattern to search within each string.
+   * @param flags combined regex flags of the regex program.
+   * @param capture native id of the regex program's capture groups setting.
    * @param repl native handle of the cudf::scalar containing the replacement string.
    * @param maxRepl maximum number of times to replace the pattern within a string
    * @return native handle of the resulting cudf column containing the string results.
    */
-  private static native long replaceRegex(long columnView, String pattern,
+  private static native long replaceRegex(long columnView, String pattern, int flags, int capture,
                                           long repl, long maxRepl) throws CudfException;
 
   /**
@@ -4148,15 +4195,17 @@ private static native long replaceMultiRegex(long columnView, String[] patterns,
                                                long repls) throws CudfException;
 
   /**
-   * Native method for replacing any character sequence matching the given pattern
-   * using the replace template for back-references.
+   * Native method for replacing any character sequence matching the given regex program
+   * pattern using the replace template for back-references.
    * @param columnView native handle of the cudf::column_view being operated on.
    * @param pattern The regular expression patterns to search within each string.
+   * @param flags Combined regex flags of the regex program.
+   * @param capture Native id of the regex program's capture groups setting.
    * @param replace The replacement template for creating the output string.
    * @return native handle of the resulting cudf column containing the string results.
    */
-  private static native long stringReplaceWithBackrefs(long columnView, String pattern,
-                                                       String replace) throws CudfException;
+  private static native long stringReplaceWithBackrefs(long columnView, String pattern, int flags,
+                                                       int capture, String replace) throws CudfException;
 
   /**
    * Native method for checking if strings in a column starts with a specified comparison string.
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 958efd364ed..c42cc430560 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1606,21 +1606,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContains(JNIEnv *env,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(JNIEnv *env, jclass,
-                                                                    jlong j_column_view,
-                                                                    jstring j_pattern, jlong j_repl,
-                                                                    jlong j_maxrepl) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(
+    JNIEnv *env, jclass, jlong j_column_view, jstring j_pattern, jint regex_flags,
+    jint capture_groups, jlong j_repl, jlong j_maxrepl) {
 
   JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, j_pattern, "pattern string is null", 0);
   JNI_NULL_CHECK(env, j_repl, "replace scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto cv = reinterpret_cast<cudf::column_view const *>(j_column_view);
-    cudf::strings_column_view scv(*cv);
-    cudf::jni::native_jstring pattern(env, j_pattern);
-    auto repl = reinterpret_cast<cudf::string_scalar const *>(j_repl);
-    return release_as_jlong(cudf::strings::replace_re(scv, pattern.get(), *repl, j_maxrepl));
+    auto const cv = reinterpret_cast<cudf::column_view const *>(j_column_view);
+    auto const strings_column = cudf::strings_column_view{*cv};
+    auto const pattern = cudf::jni::native_jstring(env, j_pattern);
+    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
+    auto const repl = reinterpret_cast<cudf::string_scalar const *>(j_repl);
+    return release_as_jlong(
+        cudf::strings::replace_re(strings_column, *regex_prog, *repl, j_maxrepl));
   }
   CATCH_STD(env, 0);
 }
@@ -1646,19 +1649,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceMultiRegex(JNIEnv
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceWithBackrefs(
-    JNIEnv *env, jclass, jlong column_view, jstring patternObj, jstring replaceObj) {
+    JNIEnv *env, jclass, jlong j_column_view, jstring pattern_obj, jint regex_flags,
+    jint capture_groups, jstring replace_obj) {
 
-  JNI_NULL_CHECK(env, column_view, "column is null", 0);
-  JNI_NULL_CHECK(env, patternObj, "pattern string is null", 0);
-  JNI_NULL_CHECK(env, replaceObj, "replace string is null", 0);
+  JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
+  JNI_NULL_CHECK(env, pattern_obj, "pattern string is null", 0);
+  JNI_NULL_CHECK(env, replace_obj, "replace string is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
-    cudf::strings_column_view scv(*cv);
-    cudf::jni::native_jstring ss_pattern(env, patternObj);
-    cudf::jni::native_jstring ss_replace(env, replaceObj);
+    auto const cv = reinterpret_cast<cudf::column_view const *>(j_column_view);
+    auto const strings_column = cudf::strings_column_view{*cv};
+    auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
+    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
+    cudf::jni::native_jstring ss_replace(env, replace_obj);
     return release_as_jlong(
-        cudf::strings::replace_with_backrefs(scv, ss_pattern.get(), ss_replace.get()));
+        cudf::strings::replace_with_backrefs(strings_column, *regex_prog, ss_replace.get()));
   }
   CATCH_STD(env, 0);
 }
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index ab4baf74277..db64dcb08c7 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -5147,29 +5147,42 @@ void teststringReplaceThrowsException() {
 
   @Test
   void testReplaceRegex() {
-    try (ColumnVector v =
-             ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title");
-         Scalar repl = Scalar.fromString("Repl");
-         ColumnVector actual = v.replaceRegex("[tT]itle", repl);
-         ColumnVector expected =
-             ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) {
-      assertColumnsAreEqual(expected, actual);
-    }
+    try (ColumnVector v = ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title");
+         Scalar repl = Scalar.fromString("Repl")) {
+      String pattern = "[tT]itle";
+      RegexProgram regexProg = new RegexProgram(pattern, CaptureGroups.NON_CAPTURE);
 
-    try (ColumnVector v =
-             ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title");
-         Scalar repl = Scalar.fromString("Repl");
-         ColumnVector actual = v.replaceRegex("[tT]itle", repl, 0)) {
-      assertColumnsAreEqual(v, actual);
-    }
+      try (ColumnVector actual = v.replaceRegex(pattern, repl);
+           ColumnVector expected =
+               ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) {
+        assertColumnsAreEqual(expected, actual);
+      }
 
-    try (ColumnVector v =
-             ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title");
-         Scalar repl = Scalar.fromString("Repl");
-         ColumnVector actual = v.replaceRegex("[tT]itle", repl, 1);
-         ColumnVector expected =
-             ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) {
-      assertColumnsAreEqual(expected, actual);
+      try (ColumnVector actual = v.replaceRegex(pattern, repl, 0)) {
+        assertColumnsAreEqual(v, actual);
+      }
+
+      try (ColumnVector actual = v.replaceRegex(pattern, repl, 1);
+           ColumnVector expected =
+               ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) {
+        assertColumnsAreEqual(expected, actual);
+      }
+
+      try (ColumnVector actual = v.replaceRegex(regexProg, repl);
+           ColumnVector expected =
+               ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) {
+        assertColumnsAreEqual(expected, actual);
+      }
+
+      try (ColumnVector actual = v.replaceRegex(regexProg, repl, 0)) {
+        assertColumnsAreEqual(v, actual);
+      }
+
+      try (ColumnVector actual = v.replaceRegex(regexProg, repl, 1);
+           ColumnVector expected =
+               ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) {
+        assertColumnsAreEqual(expected, actual);
+      }
     }
   }
 
@@ -5188,45 +5201,55 @@ void testReplaceMultiRegex() {
   @Test
   void testStringReplaceWithBackrefs() {
 
-    try (ColumnVector v = ColumnVector.fromStrings("<h1>title</h1>", "<h1>another title</h1>",
-        null);
+    try (ColumnVector v = ColumnVector.fromStrings("<h1>title</h1>", "<h1>another title</h1>", null);
          ColumnVector expected = ColumnVector.fromStrings("<h2>title</h2>",
              "<h2>another title</h2>", null);
-         ColumnVector actual = v.stringReplaceWithBackrefs("<h1>(.*)</h1>", "<h2>\\1</h2>")) {
+         ColumnVector actual = v.stringReplaceWithBackrefs("<h1>(.*)</h1>", "<h2>\\1</h2>");
+         ColumnVector actualRe =
+             v.stringReplaceWithBackrefs(new RegexProgram("<h1>(.*)</h1>"), "<h2>\\1</h2>")) {
       assertColumnsAreEqual(expected, actual);
+      assertColumnsAreEqual(expected, actualRe);
     }
 
     try (ColumnVector v = ColumnVector.fromStrings("2020-1-01", "2020-2-02", null);
          ColumnVector expected = ColumnVector.fromStrings("2020-01-01", "2020-02-02", null);
-         ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])-", "-0\\1-")) {
+         ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])-", "-0\\1-");
+         ColumnVector actualRe =
+             v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])-"), "-0\\1-")) {
       assertColumnsAreEqual(expected, actual);
+      assertColumnsAreEqual(expected, actualRe);
     }
 
-    try (ColumnVector v = ColumnVector.fromStrings("2020-01-1", "2020-02-2",
-        "2020-03-3invalid", null);
+    try (ColumnVector v = ColumnVector.fromStrings("2020-01-1", "2020-02-2", "2020-03-3invalid", null);
          ColumnVector expected = ColumnVector.fromStrings("2020-01-01", "2020-02-02",
              "2020-03-3invalid", null);
-         ColumnVector actual = v.stringReplaceWithBackrefs(
-             "-([0-9])$", "-0\\1")) {
+         ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])$", "-0\\1");
+         ColumnVector actualRe =
+             v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])$"), "-0\\1")) {
       assertColumnsAreEqual(expected, actual);
+      assertColumnsAreEqual(expected, actualRe);
     }
 
     try (ColumnVector v = ColumnVector.fromStrings("2020-01-1 random_text", "2020-02-2T12:34:56",
-        "2020-03-3invalid", null);
+             "2020-03-3invalid", null);
          ColumnVector expected = ColumnVector.fromStrings("2020-01-01 random_text",
              "2020-02-02T12:34:56", "2020-03-3invalid", null);
-         ColumnVector actual = v.stringReplaceWithBackrefs(
-             "-([0-9])([ T])", "-0\\1\\2")) {
+         ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])([ T])", "-0\\1\\2");
+         ColumnVector actualRe =
+             v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])([ T])"), "-0\\1\\2")) {
       assertColumnsAreEqual(expected, actual);
+      assertColumnsAreEqual(expected, actualRe);
     }
 
     // test zero as group index
     try (ColumnVector v = ColumnVector.fromStrings("aa-11 b2b-345", "aa-11a 1c-2b2 b2-c3", "11-aa", null);
          ColumnVector expected = ColumnVector.fromStrings("aa-11:aa:11; b2b-345:b:345;",
              "aa-11:aa:11;a 1c-2:c:2;b2 b2-c3", "11-aa", null);
-         ColumnVector actual = v.stringReplaceWithBackrefs(
-             "([a-z]+)-([0-9]+)", "${0}:${1}:${2};")) {
+         ColumnVector actual = v.stringReplaceWithBackrefs("([a-z]+)-([0-9]+)", "${0}:${1}:${2};");
+         ColumnVector actualRe =
+             v.stringReplaceWithBackrefs(new RegexProgram("([a-z]+)-([0-9]+)"), "${0}:${1}:${2};")) {
       assertColumnsAreEqual(expected, actual);
+      assertColumnsAreEqual(expected, actualRe);
     }
 
     // group index exceeds group count
@@ -5236,6 +5259,13 @@ void testStringReplaceWithBackrefs() {
       }
     });
 
+    // group index exceeds group count (RegexProgram overload)
+    assertThrows(CudfException.class, () -> {
+      try (ColumnVector v = ColumnVector.fromStrings("ABC123defgh");
+           ColumnVector r =
+               v.stringReplaceWithBackrefs(new RegexProgram("([A-Z]+)([0-9]+)([a-z]+)"), "\\4")) {
+      }
+    });
   }
 
   @Test
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index c5b330fd89c..1fea3c7a37e 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -519,6 +519,23 @@ def test_groupby_apply_jit_args(func, args, groupby_jit_data):
     run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args)
 
 
+def test_groupby_apply_jit_block_divergence():
+    # https://github.com/rapidsai/cudf/issues/12686
+    df = cudf.DataFrame(
+        {
+            "a": [0, 0, 0, 1, 1, 1],
+            "b": [1, 1, 1, 2, 3, 4],
+        }
+    )
+
+    def diverging_block(grp_df):
+        if grp_df["a"].mean() > 0:
+            return grp_df["b"].mean()
+        return 0
+
+    run_groupby_apply_jit_test(df, diverging_block, ["a"])
+
+
 @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000])
 @pytest.mark.parametrize(
     "func",
diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu
index f94f99c4b49..782371b8a44 100644
--- a/python/cudf/udf_cpp/groupby/function.cu
+++ b/python/cudf/udf_cpp/groupby/function.cu
@@ -284,7 +284,7 @@ extern "C" {
   __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \
   {                                                                                              \
     return_type const res = name<type>(data, size);                                              \
-    if (threadIdx.x == 0) { *numba_return_value = res; }                                         \
+    *numba_return_value   = res;                                                                 \
     __syncthreads();                                                                             \
     return 0;                                                                                    \
   }
@@ -309,8 +309,8 @@ extern "C" {
   __device__ int name##_##cname(                                                 \
     int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \
   {                                                                              \
-    auto const res = name<type>(data, index, size);                              \
-    if (threadIdx.x == 0) { *numba_return_value = res; }                         \
+    auto const res      = name<type>(data, index, size);                         \
+    *numba_return_value = res;                                                   \
     __syncthreads();                                                             \
     return 0;                                                                    \
   }
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index b6be5ade6ba..821ec103204 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -11,6 +11,10 @@
 
 import dask.dataframe as dd
 from dask import config
+from dask.dataframe.backends import (
+    DataFrameBackendEntrypoint,
+    PandasBackendEntrypoint,
+)
 from dask.dataframe.core import get_parallel_type, meta_nonempty
 from dask.dataframe.dispatch import (
     categorical_dtype_dispatch,
@@ -30,7 +34,7 @@
     make_meta_obj,
 )
 from dask.sizeof import sizeof as sizeof_dispatch
-from dask.utils import is_arraylike
+from dask.utils import Dispatch, is_arraylike
 
 import cudf
 from cudf.api.types import is_string_dtype
@@ -446,91 +450,127 @@ def _default_backend(func, *args, **kwargs):
         return func(*args, **kwargs)
 
 
-try:
+def _unsupported_kwargs(old, new, kwargs):
+    # Utility to raise a meaningful error when
+    # unsupported kwargs are encountered within
+    # ``to_backend_dispatch``
+    if kwargs:
+        raise ValueError(
+            f"Unsupported key-word arguments used in `to_backend` "
+            f"for {old}-to-{new} conversion: {kwargs}"
+        )
 
-    # Define "cudf" backend engine to be registered with Dask
-    from dask.dataframe.backends import DataFrameBackendEntrypoint
-
-    class CudfBackendEntrypoint(DataFrameBackendEntrypoint):
-        """Backend-entrypoint class for Dask-DataFrame
-
-        This class is registered under the name "cudf" for the
-        ``dask.dataframe.backends`` entrypoint in ``setup.cfg``.
-        Dask-DataFrame will use the methods defined in this class
-        in place of ``dask.dataframe.<creation-method>`` when the
-        "dataframe.backend" configuration is set to "cudf":
-
-        Examples
-        --------
-        >>> import dask
-        >>> import dask.dataframe as dd
-        >>> with dask.config.set({"dataframe.backend": "cudf"}):
-        ...     ddf = dd.from_dict({"a": range(10)})
-        >>> type(ddf)
-        <class 'dask_cudf.core.DataFrame'>
-        """
-
-        @staticmethod
-        def from_dict(
-            data,
-            npartitions,
-            orient="columns",
-            dtype=None,
-            columns=None,
-            constructor=cudf.DataFrame,
-        ):
-
-            return _default_backend(
-                dd.from_dict,
-                data,
-                npartitions=npartitions,
-                orient=orient,
-                dtype=dtype,
-                columns=columns,
-                constructor=constructor,
-            )
 
-        @staticmethod
-        def read_parquet(*args, engine=None, **kwargs):
-            from dask_cudf.io.parquet import CudfEngine
+# Register cudf->pandas
+to_pandas_dispatch = PandasBackendEntrypoint.to_backend_dispatch()
 
-            return _default_backend(
-                dd.read_parquet,
-                *args,
-                engine=CudfEngine,
-                **kwargs,
-            )
 
-        @staticmethod
-        def read_json(*args, **kwargs):
-            from dask_cudf.io.json import read_json
+@to_pandas_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index))
+def to_pandas_dispatch_from_cudf(data, nullable=False, **kwargs):
+    _unsupported_kwargs("cudf", "pandas", kwargs)
+    return data.to_pandas(nullable=nullable)
 
-            return read_json(*args, **kwargs)
 
-        @staticmethod
-        def read_orc(*args, **kwargs):
-            from dask_cudf.io import read_orc
+# Register pandas->cudf
+to_cudf_dispatch = Dispatch("to_cudf_dispatch")
 
-            return read_orc(*args, **kwargs)
 
-        @staticmethod
-        def read_csv(*args, **kwargs):
-            from dask_cudf.io import read_csv
+@to_cudf_dispatch.register((pd.DataFrame, pd.Series, pd.Index))
+def to_cudf_dispatch_from_pandas(data, nan_as_null=None, **kwargs):
+    _unsupported_kwargs("pandas", "cudf", kwargs)
+    return cudf.from_pandas(data, nan_as_null=nan_as_null)
 
-            return read_csv(*args, **kwargs)
 
-        @staticmethod
-        def read_hdf(*args, **kwargs):
-            from dask_cudf import from_dask_dataframe
+# Define "cudf" backend engine to be registered with Dask
+class CudfBackendEntrypoint(DataFrameBackendEntrypoint):
+    """Backend-entrypoint class for Dask-DataFrame
 
-            # HDF5 reader not yet implemented in cudf
-            warnings.warn(
-                "read_hdf is not yet implemented in cudf/dask_cudf. "
-                "Moving to cudf from pandas. Expect poor performance!"
-            )
-            return from_dask_dataframe(
-                _default_backend(dd.read_hdf, *args, **kwargs)
-            )
+    This class is registered under the name "cudf" for the
+    ``dask.dataframe.backends`` entrypoint in ``setup.cfg``.
+    Dask-DataFrame will use the methods defined in this class
+    in place of ``dask.dataframe.<creation-method>`` when the
+    "dataframe.backend" configuration is set to "cudf":
 
-except ImportError:
-    pass
+    Examples
+    --------
+    >>> import dask
+    >>> import dask.dataframe as dd
+    >>> with dask.config.set({"dataframe.backend": "cudf"}):
+    ...     ddf = dd.from_dict({"a": range(10)})
+    >>> type(ddf)
+    <class 'dask_cudf.core.DataFrame'>
+    """
+
+    @classmethod
+    def to_backend_dispatch(cls):
+        return to_cudf_dispatch
+
+    @classmethod
+    def to_backend(cls, data: dd.core._Frame, **kwargs):
+        if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)):
+            # Already a cudf-backed collection
+            _unsupported_kwargs("cudf", "cudf", kwargs)
+            return data
+        return data.map_partitions(cls.to_backend_dispatch(), **kwargs)
+
+    @staticmethod
+    def from_dict(
+        data,
+        npartitions,
+        orient="columns",
+        dtype=None,
+        columns=None,
+        constructor=cudf.DataFrame,
+    ):
+
+        return _default_backend(
+            dd.from_dict,
+            data,
+            npartitions=npartitions,
+            orient=orient,
+            dtype=dtype,
+            columns=columns,
+            constructor=constructor,
+        )
+
+    @staticmethod
+    def read_parquet(*args, engine=None, **kwargs):
+        from dask_cudf.io.parquet import CudfEngine
+
+        return _default_backend(
+            dd.read_parquet,
+            *args,
+            engine=CudfEngine,
+            **kwargs,
+        )
+
+    @staticmethod
+    def read_json(*args, **kwargs):
+        from dask_cudf.io.json import read_json
+
+        return read_json(*args, **kwargs)
+
+    @staticmethod
+    def read_orc(*args, **kwargs):
+        from dask_cudf.io import read_orc
+
+        return read_orc(*args, **kwargs)
+
+    @staticmethod
+    def read_csv(*args, **kwargs):
+        from dask_cudf.io import read_csv
+
+        return read_csv(*args, **kwargs)
+
+    @staticmethod
+    def read_hdf(*args, **kwargs):
+        from dask_cudf import from_dask_dataframe
+
+        # HDF5 reader not yet implemented in cudf
+        warnings.warn(
+            "read_hdf is not yet implemented in cudf/dask_cudf. "
+            "Moving to cudf from pandas. Expect poor performance!"
+        )
+        return from_dask_dataframe(
+            _default_backend(dd.read_hdf, *args, **kwargs)
+        )
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index ee8229bc7e8..7f8876c8564 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from packaging import version
 
 import dask
 from dask import dataframe as dd
@@ -31,6 +32,58 @@ def test_from_dict_backend_dispatch():
     dd.assert_eq(expect, ddf)
 
 
+def test_to_backend():
+    np.random.seed(0)
+    data = {
+        "x": np.random.randint(0, 5, size=10000),
+        "y": np.random.normal(size=10000),
+    }
+    with dask.config.set({"dataframe.backend": "pandas"}):
+        ddf = dd.from_dict(data, npartitions=2)
+        assert isinstance(ddf._meta, pd.DataFrame)
+
+        gdf = ddf.to_backend("cudf")
+        assert isinstance(gdf, dgd.DataFrame)
+        dd.assert_eq(cudf.DataFrame(data), ddf)
+
+        assert isinstance(gdf.to_backend()._meta, pd.DataFrame)
+
+
+def test_to_backend_kwargs():
+    data = {"x": [0, 2, np.nan, 3, 4, 5]}
+    with dask.config.set({"dataframe.backend": "pandas"}):
+        dser = dd.from_dict(data, npartitions=2)["x"]
+        assert isinstance(dser._meta, pd.Series)
+
+        # Using `nan_as_null=False` will result in a cudf-backed
+        # Series with a NaN element (rather than <NA>)
+        gser_nan = dser.to_backend("cudf", nan_as_null=False)
+        assert isinstance(gser_nan, dgd.Series)
+        assert np.isnan(gser_nan.compute()).sum() == 1
+
+        # Using `nan_as_null=True` will result in a cudf-backed
+        # Series with a <NA> element (rather than NaN)
+        gser_null = dser.to_backend("cudf", nan_as_null=True)
+        assert isinstance(gser_null, dgd.Series)
+        assert np.isnan(gser_null.compute()).sum() == 0
+
+        # Check `nullable` argument for `cudf.Series.to_pandas`
+        dser_null = gser_null.to_backend("pandas", nullable=False)
+        assert dser_null.compute().dtype == "float"
+        dser_null = gser_null.to_backend("pandas", nullable=True)
+        assert isinstance(dser_null.compute().dtype, pd.Float64Dtype)
+
+        # Check unsupported arguments
+        with pytest.raises(ValueError, match="pandas-to-cudf"):
+            dser.to_backend("cudf", bad_arg=True)
+
+        with pytest.raises(ValueError, match="cudf-to-cudf"):
+            gser_null.to_backend("cudf", bad_arg=True)
+
+        with pytest.raises(ValueError, match="cudf-to-pandas"):
+            gser_null.to_backend("pandas", bad_arg=True)
+
+
 def test_from_cudf():
     np.random.seed(0)
 
@@ -547,8 +600,6 @@ def test_unary_ops(func, gdf, gddf):
 
     # Fixed in https://github.com/dask/dask/pull/4657
     if isinstance(p, cudf.Index):
-        from packaging import version
-
         if version.parse(dask.__version__) < version.parse("1.1.6"):
             pytest.skip(
                 "dask.dataframe assert_eq index check hardcoded to "