From d8fbb192441715b51f696620d19ce92b56022bdd Mon Sep 17 00:00:00 2001
From: Alfred Xu <lovedreamf@gmail.com>
Date: Thu, 3 Jun 2021 23:02:01 +0800
Subject: [PATCH 01/15] Fix null_equality config of rolling_collect_set (#8415)

Fix #8405, and add tests covering various `null_equality` and `nan_equality` settings.

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/8415
---
 cpp/src/rolling/rolling_detail.cuh      |   7 +-
 cpp/tests/groupby/collect_set_tests.cpp |  41 ++++++
 cpp/tests/rolling/collect_ops_test.cpp  | 185 ++++++++++++++++++++++--
 3 files changed, 217 insertions(+), 16 deletions(-)
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 56b8bad0bac..525ed31ad82 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -746,11 +746,8 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation
                                                      stream,
                                                      rmm::mr::get_current_device_resource());
 
-    result = lists::detail::drop_list_duplicates(lists_column_view(collected_list->view()),
-                                                 null_equality::EQUAL,
-                                                 nan_equality::UNEQUAL,
-                                                 stream,
-                                                 mr);
+    result = lists::detail::drop_list_duplicates(
+      lists_column_view(collected_list->view()), agg._nulls_equal, agg._nans_equal, stream, mr);
   }
 
   std::unique_ptr<column> get_result()
diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp
index d5a881a1993..8ce0380ad66 100644
--- a/cpp/tests/groupby/collect_set_tests.cpp
+++ b/cpp/tests/groupby/collect_set_tests.cpp
@@ -146,6 +146,47 @@ TEST_F(CollectSetTest, StringInput)
   test_single_agg(keys, vals, keys_expected, vals_expected, CollectSetTest::collect_set());
 }
 
+TEST_F(CollectSetTest, FloatsWithNaN)
+{
+  COL_K keys{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<float> vals{
+    {1.0f, 1.0f, -2.3e-5f, -2.3e-5f, 2.3e5f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f, 0.0f},
+    {true, true, true, true, true, true, true, true, true, true, false, false}};
+  COL_K keys_expected{1};
+  // null equal with nan unequal
+  cudf::test::lists_column_wrapper<float> vals_expected{
+    {{-2.3e-5f, 1.0f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f},
+     VALIDITY{true, true, true, true, true, true, true, false}},
+  };
+  test_single_agg(keys, vals, keys_expected, vals_expected, CollectSetTest::collect_set());
+  // null unequal with nan unequal
+  vals_expected = {{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f, 0.0f},
+                    VALIDITY{true, true, true, true, true, true, true, false, false}}};
+  test_single_agg(
+    keys, vals, keys_expected, vals_expected, CollectSetTest::collect_set_null_unequal());
+  // null exclude with nan unequal
+  vals_expected = {{-2.3e-5f, 1.0f, 2.3e5f, -NAN, -NAN, NAN, NAN}};
+  test_single_agg(
+    keys, vals, keys_expected, vals_expected, CollectSetTest::collect_set_null_exclude());
+  // null equal with nan equal
+  vals_expected = {{{-2.3e-5f, 1.0f, 2.3e5f, NAN, 0.0f}, VALIDITY{true, true, true, true, false}}};
+  test_single_agg(keys,
+                  vals,
+                  keys_expected,
+                  vals_expected,
+                  cudf::make_collect_set_aggregation(
+                    null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL));
+  // null unequal with nan equal
+  vals_expected = {
+    {{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f, 0.0f}, VALIDITY{true, true, true, true, false, false}}};
+  test_single_agg(keys,
+                  vals,
+                  keys_expected,
+                  vals_expected,
+                  cudf::make_collect_set_aggregation(
+                    null_policy::INCLUDE, null_equality::UNEQUAL, nan_equality::ALL_EQUAL));
+}
+
 TYPED_TEST(CollectSetTypedTest, CollectWithNulls)
 {
   // Just use an arbitrary value to store null entries
diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp
index f97e13b49f1..8f4cd34fd35 100644
--- a/cpp/tests/rolling/collect_ops_test.cpp
+++ b/cpp/tests/rolling/collect_ops_test.cpp
@@ -1661,16 +1661,16 @@ TYPED_TEST(TypedCollectSetTest, BasicGroupedRollingWindowWithNulls)
 
   using T = TypeParam;
 
-  auto const group_column = fixed_width_column_wrapper<int32_t>{1, 1, 1, 1, 1, 2, 2, 2, 2};
+  auto const group_column = fixed_width_column_wrapper<int32_t>{1, 1, 1, 1, 1, 2, 2, 2, 2, 2};
   auto const input_column = fixed_width_column_wrapper<T, int32_t>{
-    {10, 11, 12, 13, 13, 20, 21, 21, 23}, {1, 0, 0, 1, 1, 1, 0, 1, 1}};
+    {10, 0, 0, 13, 13, 20, 21, 0, 0, 23}, {1, 0, 0, 1, 1, 1, 1, 0, 0, 1}};
 
   auto const preceding   = 2;
   auto const following   = 1;
   auto const min_periods = 1;
 
   {
-    // Nulls included.
+    // Nulls included and nulls are equal.
     auto const result =
       grouped_rolling_window(table_view{std::vector<column_view>{group_column}},
                              input_column,
@@ -1679,10 +1679,78 @@ TYPED_TEST(TypedCollectSetTest, BasicGroupedRollingWindowWithNulls)
                              min_periods,
                              *make_collect_set_aggregation<rolling_aggregation>());
     // Null values are sorted to the tails of lists (sets)
-    auto expected_child = fixed_width_column_wrapper<T, int32_t>{
-      {10, 11, 10, 11, 13, 11, 13, 12, 13, 20, 21, 20, 21, 21, 21, 23, 21, 21, 23},
-      {1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1}};
-    auto expected_offsets = fixed_width_column_wrapper<int32_t>{0, 2, 4, 6, 8, 9, 11, 14, 17, 19};
+    auto expected_child = fixed_width_column_wrapper<T, int32_t>{{
+                                                                   10, 0,      // row 0
+                                                                   10, 0,      // row 1
+                                                                   13, 0,      // row 2
+                                                                   13, 0,      // row 3
+                                                                   13,         // row 4
+                                                                   20, 21,     // row 5
+                                                                   20, 21, 0,  // row 6
+                                                                   21, 0,      // row 7
+                                                                   23, 0,      // row 8
+                                                                   23, 0,      // row 9
+                                                                 },
+                                                                 {
+                                                                   1, 0,     // row 0
+                                                                   1, 0,     // row 1
+                                                                   1, 0,     // row 2
+                                                                   1, 0,     // row 3
+                                                                   1,        // row 4
+                                                                   1, 1,     // row 5
+                                                                   1, 1, 0,  // row 6
+                                                                   1, 0,     // row 7
+                                                                   1, 0,     // row 8
+                                                                   1, 0      // row 9
+                                                                 }};
+    auto expected_offsets =
+      fixed_width_column_wrapper<int32_t>{0, 2, 4, 6, 8, 9, 11, 14, 16, 18, 20};
+
+    auto expected_result = make_lists_column(static_cast<column_view>(group_column).size(),
+                                             expected_offsets.release(),
+                                             expected_child.release(),
+                                             0,
+                                             {});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view());
+  }
+
+  {
+    // Nulls included and nulls are NOT equal.
+    auto const result = grouped_rolling_window(table_view{std::vector<column_view>{group_column}},
+                                               input_column,
+                                               preceding,
+                                               following,
+                                               min_periods,
+                                               *make_collect_set_aggregation<rolling_aggregation>(
+                                                 null_policy::INCLUDE, null_equality::UNEQUAL));
+    // Null values are sorted to the tails of lists (sets)
+    auto expected_child = fixed_width_column_wrapper<T, int32_t>{{
+                                                                   10, 0,      // row 0
+                                                                   10, 0,  0,  // row 1
+                                                                   13, 0,  0,  // row 2
+                                                                   13, 0,      // row 3
+                                                                   13,         // row 4
+                                                                   20, 21,     // row 5
+                                                                   20, 21, 0,  // row 6
+                                                                   21, 0,  0,  // row 7
+                                                                   23, 0,  0,  // row 8
+                                                                   23, 0       // row 9
+                                                                 },
+                                                                 {
+                                                                   1, 0,     // row 0
+                                                                   1, 0, 0,  // row 1
+                                                                   1, 0, 0,  // row 2
+                                                                   1, 0,     // row 3
+                                                                   1,        // row 4
+                                                                   1, 1,     // row 5
+                                                                   1, 1, 0,  // row 6
+                                                                   1, 0, 0,  // row 7
+                                                                   1, 0, 0,  // row 8
+                                                                   1, 0      // row 9
+                                                                 }};
+    auto expected_offsets =
+      fixed_width_column_wrapper<int32_t>{0, 2, 5, 8, 10, 11, 13, 16, 19, 22, 24};
 
     auto expected_result = make_lists_column(static_cast<column_view>(group_column).size(),
                                              expected_offsets.release(),
@@ -1703,10 +1771,22 @@ TYPED_TEST(TypedCollectSetTest, BasicGroupedRollingWindowWithNulls)
       min_periods,
       *make_collect_set_aggregation<rolling_aggregation>(null_policy::EXCLUDE));
 
-    auto expected_child =
-      fixed_width_column_wrapper<T, int32_t>{10, 10, 13, 13, 13, 20, 20, 21, 21, 23, 21, 23};
-
-    auto expected_offsets = fixed_width_column_wrapper<int32_t>{0, 1, 2, 3, 4, 5, 6, 8, 10, 12};
+    auto expected_child = fixed_width_column_wrapper<T, int32_t>{
+      10,  // row 0
+      10,  // row 1
+      13,  // row 2
+      13,  // row 3
+      13,  // row 4
+      20,
+      21,  // row 5
+      20,
+      21,  // row 6
+      21,  // row 7
+      23,  // row 8
+      23   // row 9
+    };
+
+    auto expected_offsets = fixed_width_column_wrapper<int32_t>{0, 1, 2, 3, 4, 5, 7, 9, 10, 11, 12};
 
     auto expected_result = make_lists_column(static_cast<column_view>(group_column).size(),
                                              expected_offsets.release(),
@@ -1957,6 +2037,68 @@ TEST_F(CollectSetTest, BoolGroupedRollingWindow)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
 
+TEST_F(CollectSetTest, FloatGroupedRollingWindowWithNaNs)
+{
+  using namespace cudf;
+  using namespace cudf::test;
+
+  auto const group_column = fixed_width_column_wrapper<int32_t>{1, 1, 1, 1, 1, 2, 2, 2, 2};
+  auto const input_column = fixed_width_column_wrapper<double>{
+    {1.23, 0.2341, 0.2341, -5.23e9, std::nan("1"), 1.1, std::nan("1"), std::nan("1"), 0.0},
+    {true, true, true, true, true, true, true, true, false}};
+
+  auto const preceding   = 2;
+  auto const following   = 1;
+  auto const min_periods = 1;
+  // test on nan_equality::UNEQUAL
+  auto const result = grouped_rolling_window(table_view{std::vector<column_view>{group_column}},
+                                             input_column,
+                                             preceding,
+                                             following,
+                                             min_periods,
+                                             *make_collect_set_aggregation<rolling_aggregation>());
+
+  auto const expected_result = lists_column_wrapper<double>{
+    {{0.2341, 1.23}, std::initializer_list<bool>{true, true}},
+    {{0.2341, 1.23}, std::initializer_list<bool>{true, true}},
+    {{-5.23e9, 0.2341}, std::initializer_list<bool>{true, true}},
+    {{-5.23e9, 0.2341, std::nan("1")}, std::initializer_list<bool>{true, true, true}},
+    {{-5.23e9, std::nan("1")}, std::initializer_list<bool>{true, true}},
+    {{1.1, std::nan("1")}, std::initializer_list<bool>{true, true}},
+    {{1.1, std::nan("1"), std::nan("1")}, std::initializer_list<bool>{true, true, true}},
+    {{std::nan("1"), std::nan("1"), 0.0}, std::initializer_list<bool>{true, true, false}},
+    {{std::nan("1"), 0.0},
+     std::initializer_list<bool>{
+       true, false}}}.release();
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view());
+
+  // test on nan_equality::ALL_EQUAL
+  auto const result_nan_equal =
+    grouped_rolling_window(table_view{std::vector<column_view>{group_column}},
+                           input_column,
+                           preceding,
+                           following,
+                           min_periods,
+                           *make_collect_set_aggregation<rolling_aggregation>(
+                             null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL));
+
+  auto const expected_result_nan_equal = lists_column_wrapper<double>{
+    {{0.2341, 1.23}, std::initializer_list<bool>{true, true}},
+    {{0.2341, 1.23}, std::initializer_list<bool>{true, true}},
+    {{-5.23e9, 0.2341}, std::initializer_list<bool>{true, true}},
+    {{-5.23e9, 0.2341, std::nan("1")}, std::initializer_list<bool>{true, true, true}},
+    {{-5.23e9, std::nan("1")}, std::initializer_list<bool>{true, true}},
+    {{1.1, std::nan("1")}, std::initializer_list<bool>{true, true}},
+    {{1.1, std::nan("1")}, std::initializer_list<bool>{true, true}},
+    {{std::nan("1"), 0.0}, std::initializer_list<bool>{true, false}},
+    {{std::nan("1"), 0.0},
+     std::initializer_list<bool>{true,
+                                 false}}}.release();
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_nan_equal->view(), result_nan_equal->view());
+}
+
 TEST_F(CollectSetTest, BasicRollingWindowWithNaNs)
 {
   using namespace cudf;
@@ -2002,6 +2144,27 @@ TEST_F(CollectSetTest, BasicRollingWindowWithNaNs)
                    *make_collect_set_aggregation<rolling_aggregation>(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
+
+  auto const expected_result_for_nan_equal =
+    lists_column_wrapper<double>{
+      {0.2341, 1.23},
+      {0.2341, 1.23, std::nan("1")},
+      {0.2341, std::nan("1")},
+      {-5.23e9, std::nan("1")},
+      {-5.23e9, std::nan("1")},
+    }
+      .release();
+
+  auto const result_with_nan_equal =
+    rolling_window(input_column,
+                   2,
+                   1,
+                   1,
+                   *make_collect_set_aggregation<rolling_aggregation>(
+                     null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_for_nan_equal->view(),
+                                      result_with_nan_equal->view());
 }
 
 TEST_F(CollectSetTest, ListTypeRollingWindow)

From 0712ffa98f9fb6cd9fcc40b138835aeebeea74b2 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Thu, 3 Jun 2021 16:13:14 -0500
Subject: [PATCH 02/15] JNI bindings for get_element (#8433)

Just like the title says: this adds JNI bindings for `get_element`.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/8433
---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 15 +++
 java/src/main/native/src/ColumnViewJni.cpp    | 14 +++
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 94 +++++++++++++++++++
 3 files changed, 123 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index d3b09c3b2bd..5529ca1408c 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2738,6 +2738,19 @@ public final ColumnVector listContainsColumn(ColumnView key) {
     return new ColumnVector(listContainsColumn(getNativeView(), key.getNativeView()));
   }
 
+  /**
+   * Get a single item from the column at the specified index as a Scalar.
+   *
+   * Be careful. This is expensive and may involve running a kernel to copy the data out.
+   *
+   * @param index the index to look at
+   * @return the value at that index as a scalar.
+   * @throws CudfException if the index is out of bounds.
+   */
+  public final Scalar getScalarElement(int index) {
+    return new Scalar(getType(), getElement(getNativeView(), index));
+  }
+
   /////////////////////////////////////////////////////////////////////////////
   // INTERNAL/NATIVE ACCESS
   /////////////////////////////////////////////////////////////////////////////
@@ -3042,6 +3055,8 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
    */
   private static native long listContainsColumn(long nativeView, long keyColumn);
 
+  private static native long getElement(long nativeView, int index);
+
   private static native long castTo(long nativeHandle, int type, int scale);
 
   private static native long bitCastTo(long nativeHandle, int type, int scale);
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index d433e8d36f5..e14a1a12fcc 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -228,6 +228,20 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseSS(JNIEnv *env, jcl
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getElement(JNIEnv *env, jclass,
+                                                                  jlong from,
+                                                                  jint index) {
+  JNI_NULL_CHECK(env, from, "from column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto from_vec = reinterpret_cast<cudf::column_view *>(from);
+    std::unique_ptr<cudf::scalar> result =
+        cudf::get_element(*from_vec, index);
+    return reinterpret_cast<jlong>(result.release());
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_reduce(JNIEnv *env, jclass,
                                                               jlong j_col_view,
                                                               jlong j_agg,
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 08585746267..1f7751a7a37 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -168,6 +168,100 @@ void testClampInt() {
       }
   }
 
+  @Test
+  void testGetElementInt() {
+    try (ColumnVector cv = ColumnVector.fromBoxedInts(3, 2, 1, null);
+         Scalar s0 = cv.getScalarElement(0);
+         Scalar s1 = cv.getScalarElement(1);
+         Scalar s2 = cv.getScalarElement(2);
+         Scalar s3 = cv.getScalarElement(3)) {
+      assertEquals(3, s0.getInt());
+      assertEquals(2, s1.getInt());
+      assertEquals(1, s2.getInt());
+      assertFalse(s3.isValid());
+    }
+  }
+
+  @Test
+  void testGetElementByte() {
+    try (ColumnVector cv = ColumnVector.fromBoxedBytes((byte)3, (byte)2, (byte)1, null);
+         Scalar s0 = cv.getScalarElement(0);
+         Scalar s1 = cv.getScalarElement(1);
+         Scalar s2 = cv.getScalarElement(2);
+         Scalar s3 = cv.getScalarElement(3)) {
+      assertEquals(3, s0.getByte());
+      assertEquals(2, s1.getByte());
+      assertEquals(1, s2.getByte());
+      assertFalse(s3.isValid());
+    }
+  }
+
+  @Test
+  void testGetElementFloat() {
+    try (ColumnVector cv = ColumnVector.fromBoxedFloats(3f, 2f, 1f, null);
+         Scalar s0 = cv.getScalarElement(0);
+         Scalar s1 = cv.getScalarElement(1);
+         Scalar s2 = cv.getScalarElement(2);
+         Scalar s3 = cv.getScalarElement(3)) {
+      assertEquals(3f, s0.getFloat());
+      assertEquals(2f, s1.getFloat());
+      assertEquals(1f, s2.getFloat());
+      assertFalse(s3.isValid());
+    }
+  }
+
+  @Test
+  void testGetElementString() {
+    try (ColumnVector cv = ColumnVector.fromStrings("3a", "2b", "1c", null);
+         Scalar s0 = cv.getScalarElement(0);
+         Scalar s1 = cv.getScalarElement(1);
+         Scalar s2 = cv.getScalarElement(2);
+         Scalar s3 = cv.getScalarElement(3)) {
+      assertEquals("3a", s0.getJavaString());
+      assertEquals("2b", s1.getJavaString());
+      assertEquals("1c", s2.getJavaString());
+      assertFalse(s3.isValid());
+    }
+  }
+
+  @Test
+  void testGetElementDecimal() {
+    try (ColumnVector cv = ColumnVector.decimalFromLongs(1,3, 2, 1, -1);
+         Scalar s0 = cv.getScalarElement(0);
+         Scalar s1 = cv.getScalarElement(1);
+         Scalar s2 = cv.getScalarElement(2);
+         Scalar s3 = cv.getScalarElement(3)) {
+      assertEquals(1, s0.getType().getScale());
+      assertEquals(new BigDecimal("3E+1"), s0.getBigDecimal());
+      assertEquals(new BigDecimal("2E+1"), s1.getBigDecimal());
+      assertEquals(new BigDecimal("1E+1"), s2.getBigDecimal());
+      assertEquals(new BigDecimal("-1E+1"), s3.getBigDecimal());
+    }
+  }
+
+  @Test
+  void testGetElementList() {
+    HostColumnVector.DataType dt = new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.INT32));
+    try (ColumnVector cv = ColumnVector.fromLists(dt, Arrays.asList(3, 2),
+        Arrays.asList(1), Arrays.asList(), null);
+         Scalar s0 = cv.getScalarElement(0);
+         ColumnView s0Cv = s0.getListAsColumnView();
+         ColumnVector expected0 = ColumnVector.fromInts(3, 2);
+         Scalar s1 = cv.getScalarElement(1);
+         ColumnView s1Cv = s1.getListAsColumnView();
+         ColumnVector expected1 = ColumnVector.fromInts(1);
+         Scalar s2 = cv.getScalarElement(2);
+         ColumnView s2Cv = s2.getListAsColumnView();
+         ColumnVector expected2 = ColumnVector.fromInts();
+         Scalar s3 = cv.getScalarElement(3)) {
+      assertColumnsAreEqual(expected0, s0Cv);
+      assertColumnsAreEqual(expected1, s1Cv);
+      assertColumnsAreEqual(expected2, s2Cv);
+      assertFalse(s3.isValid());
+    }
+  }
+
  @Test
   void testStringCreation() {
     try (ColumnVector cv = ColumnVector.fromStrings("d", "sd", "sde", null, "END");

From 37cdc71b643312c42b025b1e3055c213507b2457 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 3 Jun 2021 19:22:02 -0500
Subject: [PATCH 03/15] Update dask make_meta changes to be compatible with
 dask upstream (#8426)

This PR updates the `make_meta` dispatch renaming so that it is compatible with the latest dask while remaining backward compatible with older versions of dask.

This PR needs upstream dask changes: https://github.com/dask/dask/pull/7743

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cudf/pull/8426
---
 python/dask_cudf/dask_cudf/backends.py        | 14 ++++++++++----
 python/dask_cudf/dask_cudf/core.py            | 13 +++++++------
 python/dask_cudf/dask_cudf/tests/test_core.py | 12 +++---------
 3 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 1c66f256d19..7b8bd5c3f87 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -18,7 +18,6 @@
     _scalar_from_dtype,
     is_arraylike,
     is_scalar,
-    make_meta,
 )
 
 try:
@@ -26,6 +25,13 @@
 except ImportError:
     from dask.dataframe.utils import make_meta as make_meta_obj
 
+try:
+    from dask.dataframe.dispatch import (
+        make_meta_dispatch as make_meta_dispatch,
+    )
+except ImportError:
+    from dask.dataframe.utils import make_meta as make_meta_dispatch
+
 import cudf
 from cudf.utils.dtypes import is_string_dtype
 
@@ -121,12 +127,12 @@ def meta_nonempty_cudf(x):
     return res
 
 
-@make_meta.register((cudf.Series, cudf.DataFrame))
+@make_meta_dispatch.register((cudf.Series, cudf.DataFrame))
 def make_meta_cudf(x, index=None):
     return x.head(0)
 
 
-@make_meta.register(cudf.Index)
+@make_meta_dispatch.register(cudf.Index)
 def make_meta_cudf_index(x, index=None):
     return x[:0]
 
@@ -173,7 +179,7 @@ def make_meta_object_cudf(x, index=None):
         return x[:0]
 
     if index is not None:
-        index = make_meta(index)
+        index = make_meta_dispatch(index)
 
     if isinstance(x, dict):
         return cudf.DataFrame(
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index dc63c5c435e..8af6188e625 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -14,7 +14,13 @@
 from dask.compatibility import apply
 from dask.context import _globals
 from dask.core import flatten
-from dask.dataframe.core import Scalar, finalize, handle_out, map_partitions
+from dask.dataframe.core import (
+    Scalar,
+    finalize,
+    handle_out,
+    make_meta as dask_make_meta,
+    map_partitions,
+)
 from dask.dataframe.utils import raise_on_meta_error
 from dask.highlevelgraph import HighLevelGraph
 from dask.optimization import cull, fuse
@@ -26,11 +32,6 @@
 from dask_cudf import sorting
 from dask_cudf.accessors import ListMethods
 
-try:
-    from dask.dataframe.utils import make_meta_util as dask_make_meta
-except ImportError:
-    from dask.dataframe.core import make_meta as dask_make_meta
-
 DASK_VERSION = LooseVersion(dask.__version__)
 
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index e7dff10b527..2f73534b45a 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -9,13 +9,7 @@
 
 import dask
 from dask import dataframe as dd
-from dask.dataframe.core import meta_nonempty
-
-try:
-    from dask.dataframe.utils import make_meta_util as dask_make_meta
-except ImportError:
-    from dask.dataframe.core import make_meta as dask_make_meta
-
+from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty
 from dask.utils import M
 
 import cudf
@@ -827,9 +821,9 @@ def test_merging_categorical_columns():
 
 def test_correct_meta():
     try:
-        from dask.dataframe.utils import make_meta_util  # noqa: F401
+        from dask.dataframe.dispatch import make_meta_obj  # noqa: F401
     except ImportError:
-        pytest.skip("need make_meta_util to be preset")
+        pytest.skip("need make_meta_obj to be preset")
 
     # Need these local imports in this specific order.
     # For context: https://github.com/rapidsai/cudf/issues/7946

From ad6e0bd7be204fd10d3c8aeb3be5f2d15560c22b Mon Sep 17 00:00:00 2001
From: Ahmet Uyar <32486572+ahmet-uyar@users.noreply.github.com>
Date: Fri, 4 Jun 2021 03:25:07 +0300
Subject: [PATCH 04/15] Fixed documentation bug in groupby agg method (#8325)

This is a non-functional documentation update.
The output of the `c` column is missing from the result of the first example in the `agg` method in
python/cudf/cudf/core/groupby/groupby.py.
I added the values of the `c` column.

Authors:
  - Ahmet Uyar (https://github.com/ahmet-uyar)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)
  - Nghia Truong (https://github.com/ttnghia)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/8325
---
 python/cudf/cudf/core/groupby/groupby.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 6a298df32d6..43476b4b781 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -139,10 +139,10 @@ def agg(self, func):
         >>> a = cudf.DataFrame(
             {'a': [1, 1, 2], 'b': [1, 2, 3], 'c': [2, 2, 1]})
         >>> a.groupby('a').agg('sum')
-           b
+           b  c
         a
-        2  3
-        1  3
+        2  3  1
+        1  3  4
 
         Specifying a list of aggregations to perform on each column.
 
@@ -347,7 +347,7 @@ def pipe(self, func, *args, **kwargs):
         >>> import cudf
         >>> df = cudf.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': [1, 2, 3, 4]})
         >>> df
-        A  B
+           A  B
         0  a  1
         1  b  2
         2  a  3
@@ -357,7 +357,7 @@ def pipe(self, func, *args, **kwargs):
         in one pass, you can do
 
         >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
-        B
+           B
         A
         a  2
         b  2

From 6792be91ab571f24a4637216d25e179d25feda03 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 4 Jun 2021 08:28:31 -0400
Subject: [PATCH 05/15] Add public libcudf match_dictionaries API (#8429)

This PR creates a public API for the internal libcudf `cudf::dictionary::detail::match_dictionaries` function to help with transitioning the cudf python CategoricalColumn over to using the libcudf dictionary column.

No function has changed or been added but this PR does add a formal gtest for the new public API.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/8429
---
 .../cudf/dictionary/detail/update_keys.hpp    | 14 ++++-------
 cpp/include/cudf/dictionary/update_keys.hpp   | 17 ++++++++++++-
 cpp/src/dictionary/replace.cu                 |  3 ++-
 cpp/src/dictionary/set_keys.cu                | 16 +++++++++---
 cpp/tests/dictionary/set_keys_test.cpp        | 25 ++++++++++++++++++-
 5 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index 9d3cc9f90bc..8c037406e45 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -72,18 +73,13 @@ std::unique_ptr<column> set_keys(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Create new dictionaries that have keys merged from the input dictionaries.
+ * @copydoc
+ * cudf::dictionary::match_dictionaries(cudf::host_span<cudf::dictionary_column_view const>,rmm::mr::device_memory_resource*)
  *
- * This will concatenate the keys for each dictionary and then call `set_keys` on each.
- * The result is a vector of new dictionaries with a common set of keys.
- *
- * @param input Dictionary columns to match keys.
- * @param mr Device memory resource used to allocate the returned column's device memory.
  * @param stream CUDA stream used for device memory operations and kernel launches.
- * @return New dictionary column.
  */
 std::vector<std::unique_ptr<column>> match_dictionaries(
-  std::vector<dictionary_column_view> input,
+  cudf::host_span<dictionary_column_view const> input,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp
index 99a6c705edc..2b66a4d5072 100644
--- a/cpp/include/cudf/dictionary/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/update_keys.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/span.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -139,6 +140,20 @@ std::unique_ptr<column> set_keys(
   column_view const& keys,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Create new dictionaries that have keys merged from the input dictionaries.
+ *
+ * This will concatenate the keys for each dictionary and then call `set_keys` on each.
+ * The result is a vector of new dictionaries with a common set of keys.
+ *
+ * @param input Dictionary columns to match keys.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New dictionary columns.
+ */
+std::vector<std::unique_ptr<column>> match_dictionaries(
+  cudf::host_span<dictionary_column_view const> input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace dictionary
 }  // namespace cudf
diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu
index 9b644f38794..1dbb844a606 100644
--- a/cpp/src/dictionary/replace.cu
+++ b/cpp/src/dictionary/replace.cu
@@ -90,7 +90,8 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
   CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match");
 
   // first combine the keys so both input dictionaries have the same set
-  auto matched = match_dictionaries({input, replacement}, stream, mr);
+  auto matched =
+    match_dictionaries(std::vector<dictionary_column_view>({input, replacement}), stream, mr);
 
   // now build the new indices by doing replace-null using the updated input indices
   auto const input_indices =
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index 8f07c9cbbed..2e0ab389a9c 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -150,9 +150,10 @@ std::unique_ptr<column> set_keys(
                                 new_nulls.second);
 }
 
-std::vector<std::unique_ptr<column>> match_dictionaries(std::vector<dictionary_column_view> input,
-                                                        rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+std::vector<std::unique_ptr<column>> match_dictionaries(
+  cudf::host_span<dictionary_column_view const> input,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
   std::vector<column_view> keys(input.size());
   std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); });
@@ -221,5 +222,12 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
   return detail::set_keys(dictionary_column, keys, rmm::cuda_stream_default, mr);
 }
 
+std::vector<std::unique_ptr<column>> match_dictionaries(
+  cudf::host_span<dictionary_column_view const> input, rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::match_dictionaries(input, rmm::cuda_stream_default, mr);
+}
+
 }  // namespace dictionary
 }  // namespace cudf
diff --git a/cpp/tests/dictionary/set_keys_test.cpp b/cpp/tests/dictionary/set_keys_test.cpp
index ebeb94e0ba9..9e15bc63740 100644
--- a/cpp/tests/dictionary/set_keys_test.cpp
+++ b/cpp/tests/dictionary/set_keys_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -84,3 +84,26 @@ TEST_F(DictionarySetKeysTest, Errors)
   cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
   EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
+
+TEST_F(DictionarySetKeysTest, MatchDictionaries)
+{
+  cudf::test::dictionary_column_wrapper<int32_t> col1{5, 0, 4, 1, 2, 2, 2, 5, 0};
+  cudf::test::dictionary_column_wrapper<int32_t> col2{1, 0, 3, 1, 4, 5, 6, 5, 0};
+
+  auto input = std::vector<cudf::dictionary_column_view>(
+    {cudf::dictionary_column_view(col1), cudf::dictionary_column_view(col2)});
+
+  auto results = cudf::dictionary::match_dictionaries(input);
+  auto keys1   = cudf::dictionary_column_view(results[0]->view()).keys();
+  auto keys2   = cudf::dictionary_column_view(results[1]->view()).keys();
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(keys1, keys2);
+
+  auto result1 = cudf::dictionary::decode(cudf::dictionary_column_view(results[0]->view()));
+  auto result2 = cudf::dictionary::decode(cudf::dictionary_column_view(results[1]->view()));
+
+  auto expected1 = cudf::dictionary::decode(cudf::dictionary_column_view(col1));
+  auto expected2 = cudf::dictionary::decode(cudf::dictionary_column_view(col2));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result1->view(), expected1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result2->view(), expected2->view());
+}

From 79f42f9bd9f4cf7573dd5e8f10976dd8cb48e969 Mon Sep 17 00:00:00 2001
From: Alfred Xu <lovedreamf@gmail.com>
Date: Mon, 7 Jun 2021 10:54:56 +0800
Subject: [PATCH 06/15] JNI bindings for sort_lists (#8439)

This PR provides a JNI wrapper for `cudf::lists::sort_lists`, which sorts the elements within each row of a list column. This feature is required by https://github.com/NVIDIA/spark-rapids/issues/2557.

In addition, this PR updates the comments and tests on NaNEquality/NullEquality for rolling_collect_set.

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/8439
---
 .../main/java/ai/rapids/cudf/Aggregation.java |   6 -
 .../main/java/ai/rapids/cudf/ColumnView.java  |  15 +++
 java/src/main/native/src/ColumnViewJni.cpp    |  17 +++
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 112 ++++++++++++++++++
 .../test/java/ai/rapids/cudf/TableTest.java   |  36 +++++-
 5 files changed, 174 insertions(+), 12 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index 28f7bd62da1..bf5e9bb5e35 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -300,10 +300,6 @@ public boolean equals(Object other) {
         }
     }
 
-    /**
-     * WARNING: For now, NullEquality of UNEQUAL and NaNEquality of ALL_EQUAL doesn't work,
-     * because of incorrect parameter passing in libcudf (https://github.com/rapidsai/cudf/issues/8405).
-     */
     public static final class CollectSetAggregation extends Aggregation
         implements RollingAggregation<CollectSetAggregation> {
         private final NullPolicy nullPolicy;
@@ -722,8 +718,6 @@ public static CollectSetAggregation collectSet() {
 
     /**
      * Collect the values into a set.
-     * WARNING: Due to the bug in libcudf (https://github.com/rapidsai/cudf/issues/8405),
-     * NullEquality of UNEQUAL and NaNEquality of ALL_EQUAL doesn't work properly under rolling window.
      *
      * @param nullPolicy   Indicates whether to include/exclude nulls during collection.
      * @param nullEquality Flag to specify whether null entries within each list should be considered equal.
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 5529ca1408c..db42a8c9ca2 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2738,6 +2738,19 @@ public final ColumnVector listContainsColumn(ColumnView key) {
     return new ColumnVector(listContainsColumn(getNativeView(), key.getNativeView()));
   }
 
+  /**
+   * Segmented sort of the elements within a list in each row of a list column.
+   * NOTICE: list columns with nested child are NOT supported yet.
+   *
+   * @param isDescending   whether sorting each row with descending order (or ascending order)
+   * @param isNullSmallest whether to regard the null value as the min value (or the max value)
+   * @return a List ColumnVector with elements in each list sorted
+   */
+  public final ColumnVector listSortRows(boolean isDescending, boolean isNullSmallest) {
+    assert type.equals(DType.LIST) : "column type must be a LIST";
+    return new ColumnVector(listSortRows(getNativeView(), isDescending, isNullSmallest));
+  }
+
   /**
    * Get a single item from the column at the specified index as a Scalar.
    *
@@ -3055,6 +3068,8 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
    */
   private static native long listContainsColumn(long nativeView, long keyColumn);
 
+  private static native long listSortRows(long nativeView, boolean isDescending, boolean isNullSmallest);
+
   private static native long getElement(long nativeView, int index);
 
   private static native long castTo(long nativeHandle, int type, int scale);
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index e14a1a12fcc..d41ed97b4cb 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -26,6 +26,7 @@
 #include <cudf/lists/count_elements.hpp>
 #include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/extract.hpp>
+#include <cudf/lists/sorting.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/quantiles.hpp>
 #include <cudf/reduction.hpp>
@@ -401,6 +402,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, jclass,
+                                                                    jlong column_view,
+                                                                    jboolean is_descending,
+                                                                    jboolean is_null_smallest) {
+  JNI_NULL_CHECK(env, column_view, "column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto sort_order = is_descending ? cudf::order::DESCENDING : cudf::order::ASCENDING;
+    auto null_order = is_null_smallest ? cudf::null_order::BEFORE : cudf::null_order::AFTER;
+    auto *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    auto ret = cudf::lists::sort_lists(cudf::lists_column_view(*cv), sort_order, null_order);
+    return reinterpret_cast<jlong>(ret.release());
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass,
                                                                         jlong column_view,
                                                                         jlong delimiter) {
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 1f7751a7a37..16570483f17 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4005,6 +4005,118 @@ void testListContainsIntCol() {
     }
   }
 
+  @Test
+  void testListSortRowsWithIntChild() {
+    List<Integer> list1 = Arrays.asList(1, 3, 0, 2);
+    List<Integer> ascSortedList1 = Arrays.asList(0, 1, 2, 3);
+    List<Integer> decSortedList1 = Arrays.asList(3, 2, 1, 0);
+
+    List<Integer> list2 = Arrays.asList(7, 5, 6, 4);
+    List<Integer> ascSortedList2 = Arrays.asList(4, 5, 6, 7);
+    List<Integer> decSortedList2 = Arrays.asList(7, 6, 5, 4);
+
+    List<Integer> list3 = Arrays.asList(-8, null, -9, -10);
+    List<Integer> ascSortedList3 = Arrays.asList(-10, -9, -8, null);
+    List<Integer> ascSortedNullMinList3 = Arrays.asList(null, -10, -9, -8);
+    List<Integer> decSortedList3 = Arrays.asList(null, -8, -9, -10);
+    List<Integer> decSortedNullMinList3 = Arrays.asList(-8, -9, -10, null);
+
+    List<Integer> list4 = Arrays.asList(null, -12, null, 11);
+    List<Integer> ascSortedList4 = Arrays.asList(-12, 11, null, null);
+    List<Integer> ascSortedNullMinList4 = Arrays.asList(null, null, -12, 11);
+    List<Integer> decSortedList4 = Arrays.asList(null, null, 11, -12);
+    List<Integer> decSortedNullMinList4 = Arrays.asList(11, -12, null, null);
+
+    List<Integer> list5 = null;
+
+    HostColumnVector.ListType listType = new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.INT32));
+    // Ascending + NullLargest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             ascSortedList1, ascSortedList2, ascSortedList3, ascSortedList4, list5);
+         ColumnVector result = v.listSortRows(false, false)) {
+      assertColumnsAreEqual(expected, result);
+    }
+    // Descending + NullLargest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             decSortedList1, decSortedList2, decSortedList3, decSortedList4, list5);
+         ColumnVector result = v.listSortRows(true, false)) {
+      assertColumnsAreEqual(expected, result);
+    }
+    // Ascending + NullSmallest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             ascSortedList1, ascSortedList2, ascSortedNullMinList3, ascSortedNullMinList4, list5);
+         ColumnVector result = v.listSortRows(false, true)) {
+      assertColumnsAreEqual(expected, result);
+    }
+    // Descending + NullSmallest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             decSortedList1, decSortedList2, decSortedNullMinList3, decSortedNullMinList4, list5);
+         ColumnVector result = v.listSortRows(true, true)) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testListSortRowsWithStringChild() {
+    List<String> list1 = Arrays.asList("b", "d", "a", "c");
+    List<String> ascSortedList1 = Arrays.asList("a", "b", "c", "d");
+    List<String> decSortedList1 = Arrays.asList("d", "c", "b", "a");
+
+    List<String> list2 = Arrays.asList("h", "f", "g", "e");
+    List<String> ascSortedList2 = Arrays.asList("e", "f", "g", "h");
+    List<String> decSortedList2 = Arrays.asList("h", "g", "f", "e");
+
+    List<String> list3 = Arrays.asList("C", null, "B", "A");
+    List<String> ascSortedList3 = Arrays.asList("A", "B", "C", null);
+    List<String> ascSortedNullMinList3 = Arrays.asList(null, "A", "B", "C");
+    List<String> decSortedList3 = Arrays.asList(null, "C", "B", "A");
+    List<String> decSortedNullMinList3 = Arrays.asList("C", "B", "A", null);
+
+    List<String> list4 = Arrays.asList(null, "D", null, "d");
+    List<String> ascSortedList4 = Arrays.asList("D", "d", null, null);
+    List<String> ascSortedNullMinList4 = Arrays.asList(null, null, "D", "d");
+    List<String> decSortedList4 = Arrays.asList(null, null, "d", "D");
+    List<String> decSortedNullMinList4 = Arrays.asList("d", "D", null, null);
+
+    List<String> list5 = null;
+
+    HostColumnVector.ListType listType = new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.STRING));
+    // Ascending + NullLargest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             ascSortedList1, ascSortedList2, ascSortedList3, ascSortedList4, list5);
+         ColumnVector result = v.listSortRows(false, false)) {
+      assertColumnsAreEqual(expected, result);
+    }
+    // Descending + NullLargest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             decSortedList1, decSortedList2, decSortedList3, decSortedList4, list5);
+         ColumnVector result = v.listSortRows(true, false)) {
+      assertColumnsAreEqual(expected, result);
+    }
+    // Ascending + NullSmallest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             ascSortedList1, ascSortedList2, ascSortedNullMinList3, ascSortedNullMinList4, list5);
+         ColumnVector result = v.listSortRows(false, true)) {
+      assertColumnsAreEqual(expected, result);
+    }
+    // Descending + NullSmallest
+    try (ColumnVector v = ColumnVector.fromLists(listType, list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromLists(listType,
+             decSortedList1, decSortedList2, decSortedNullMinList3, decSortedNullMinList4, list5);
+         ColumnVector result = v.listSortRows(true, true)) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
   @Test
   void testStringSplitRecord() {
       try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings");
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 8716a214bdd..c94887d566d 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -3045,11 +3045,13 @@ void testWindowingCollectList() {
 
   @Test
   void testWindowingCollectSet() {
-    // TODO: Add test cases for aggCollectWithUnEqNulls and aggCollectWithEqNaNs after
-    // the issue (https://github.com/rapidsai/cudf/issues/8405) being addressed.
     Aggregation aggCollect = Aggregation.collectSet();
     Aggregation aggCollectWithEqNulls = Aggregation.collectSet(NullPolicy.INCLUDE,
         NullEquality.EQUAL, NaNEquality.UNEQUAL);
+    Aggregation aggCollectWithUnEqNulls = Aggregation.collectSet(NullPolicy.INCLUDE,
+        NullEquality.UNEQUAL, NaNEquality.UNEQUAL);
+    Aggregation aggCollectWithEqNaNs = Aggregation.collectSet(NullPolicy.INCLUDE,
+        NullEquality.EQUAL, NaNEquality.ALL_EQUAL);
 
     try (Scalar two = Scalar.fromInt(2);
          Scalar one = Scalar.fromInt(1);
@@ -3071,7 +3073,7 @@ void testWindowingCollectSet() {
           assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn);
 
           // Primitive type: INT32
-          //  a) excluding nulls
+          //  a) excluding NULLs
           try (Table windowAggResults = sorted.groupBy(0, 1)
               .aggregateWindows(aggCollect.onColumn(3).overWindow(winOpts));
                ColumnVector expected = ColumnVector.fromLists(
@@ -3081,7 +3083,7 @@ void testWindowingCollectSet() {
                    Arrays.asList(), Arrays.asList(6), Arrays.asList(6, 7), Arrays.asList(6, 7))) {
             assertColumnsAreEqual(expected, windowAggResults.getColumn(0));
           }
-          //  b) including nulls AND nulls are equal
+          //  b) including NULLs AND NULLs are equal
           try (Table windowAggResults = sorted.groupBy(0, 1)
               .aggregateWindows(aggCollectWithEqNulls.onColumn(3).overWindow(winOpts));
                ColumnVector expected = ColumnVector.fromLists(
@@ -3091,9 +3093,19 @@ void testWindowingCollectSet() {
                    Arrays.asList((Integer) null), Arrays.asList(6, null), Arrays.asList(6, 7, null), Arrays.asList(6, 7))) {
             assertColumnsAreEqual(expected, windowAggResults.getColumn(0));
           }
+          //  c) including NULLs AND NULLs are unequal
+          try (Table windowAggResults = sorted.groupBy(0, 1)
+              .aggregateWindows(aggCollectWithUnEqNulls.onColumn(3).overWindow(winOpts));
+               ColumnVector expected = ColumnVector.fromLists(
+                   new ListType(false, new BasicType(false, DType.INT32)),
+                   Arrays.asList(5), Arrays.asList(1, 5), Arrays.asList(1, 5), Arrays.asList(1),
+                   Arrays.asList(1, 4), Arrays.asList(1, 3, 4), Arrays.asList(3, 4), Arrays.asList(3, 4),
+                   Arrays.asList(null, null), Arrays.asList(6, null, null), Arrays.asList(6, 7, null), Arrays.asList(6, 7))) {
+            assertColumnsAreEqual(expected, windowAggResults.getColumn(0));
+          }
 
           // Primitive type: FLOAT64
-          //  a) excluding nulls
+          //  a) excluding NULLs
           try (Table windowAggResults = sorted.groupBy(0, 1)
               .aggregateWindows(aggCollect.onColumn(4).overWindow(winOpts));
                ColumnVector expected = ColumnVector.fromLists(
@@ -3105,7 +3117,7 @@ void testWindowingCollectSet() {
                    Arrays.asList(Double.NaN, Double.NaN), Arrays.asList(Double.NaN, Double.NaN))) {
             assertColumnsAreEqual(expected, windowAggResults.getColumn(0));
           }
-          //  b) including nulls AND nulls are equal
+          //  b) including NULLs AND NULLs are equal
           try (Table windowAggResults = sorted.groupBy(0, 1)
               .aggregateWindows(aggCollectWithEqNulls.onColumn(4).overWindow(winOpts));
                ColumnVector expected = ColumnVector.fromLists(
@@ -3117,6 +3129,18 @@ void testWindowingCollectSet() {
                    Arrays.asList(Double.NaN, Double.NaN, null), Arrays.asList(Double.NaN, Double.NaN))) {
             assertColumnsAreEqual(expected, windowAggResults.getColumn(0));
           }
+          //  c) including NULLs AND NULLs are equal AND NaNs are equal
+          try (Table windowAggResults = sorted.groupBy(0, 1)
+              .aggregateWindows(aggCollectWithEqNaNs.onColumn(4).overWindow(winOpts));
+               ColumnVector expected = ColumnVector.fromLists(
+                   new ListType(false, new BasicType(false, DType.FLOAT64)),
+                   Arrays.asList(1.1), Arrays.asList(1.1, null), Arrays.asList(1.1, 2.2, null), Arrays.asList(2.2, null),
+                   Arrays.asList(-3.0, 1.3e-7), Arrays.asList(-3.0, 1.3e-7),
+                   Arrays.asList(-3.0, 1.3e-7, Double.NaN), Arrays.asList(-3.0, Double.NaN),
+                   Arrays.asList(1e-3, null), Arrays.asList(1e-3, Double.NaN, null),
+                   Arrays.asList(Double.NaN, null), Arrays.asList(Double.NaN))) {
+            assertColumnsAreEqual(expected, windowAggResults.getColumn(0));
+          }
         }
       }
     }

From 854176b5a5824bb6286b15e6b847ca01091a24f7 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Mon, 7 Jun 2021 17:32:37 +0200
Subject: [PATCH 07/15] Update UCX-Py version to 0.20 (#8446)

Author:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/8446
---
 ci/gpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index d9f53f171ab..5f163f93410 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -80,7 +80,7 @@ gpuci_conda_retry install -y \
                   "rapids-notebook-env=$MINOR_VERSION.*" \
                   "dask-cuda=${MINOR_VERSION}" \
                   "rmm=$MINOR_VERSION.*" \
-                  "ucx-py=${MINOR_VERSION}"
+                  "ucx-py=0.20.*"
 
 # https://docs.rapids.ai/maintainers/depmgmt/
 # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env

From ff1e8499c7461a445c714a4b9702fc239237d5e0 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 7 Jun 2021 15:47:27 -0400
Subject: [PATCH 08/15] Refactor setting stack size in regex code (#8358)

This PR is the first of several to clean up the regex strings code. The regex code employs multiple stack sizes to perform its matching based on the number of instructions in the given regex pattern. In the current implementation, each caller allocates the stack memory for the regex code. This PR moves that allocation down into the regex functions themselves. This helps simplify the interface and reduce compile time a bit.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - https://github.com/nvdbaranec
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/8358
---
 cpp/src/strings/contains.cu            |  30 ++++---
 cpp/src/strings/extract.cu             |  16 ++--
 cpp/src/strings/findall.cu             |  30 ++++---
 cpp/src/strings/regex/regex.cuh        |  53 ++++++-------
 cpp/src/strings/regex/regex.inl        | 104 ++++++++++++-------------
 cpp/src/strings/regex/regexec.cu       |   6 +-
 cpp/src/strings/replace/backref_re.cu  |  14 +++-
 cpp/src/strings/replace/backref_re.cuh |  10 +--
 cpp/src/strings/replace/multi_re.cu    |  19 +++--
 cpp/src/strings/replace/replace_re.cu  |  61 ++++++++-------
 cpp/tests/strings/contains_tests.cpp   |  25 ++++++
 11 files changed, 213 insertions(+), 155 deletions(-)

diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu
index 81a499084f6..082e6655cef 100644
--- a/cpp/src/strings/contains.cu
+++ b/cpp/src/strings/contains.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,13 +54,11 @@ struct contains_fn {
   __device__ bool operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) return 0;
-    u_char data1[stack_size], data2[stack_size];
-    prog.set_stack_mem(data1, data2);
     string_view d_str = d_strings.element<string_view>(idx);
     int32_t begin     = 0;
     int32_t end       = bmatch ? 1  // match only the beginning of the string;
                          : -1;      // this handles empty strings too
-    return static_cast<bool>(prog.find(idx, d_str, begin, end));
+    return static_cast<bool>(prog.find<stack_size>(idx, d_str, begin, end));
   }
 };
 
@@ -91,7 +89,7 @@ std::unique_ptr<column> contains_util(
 
   // fill the output column
   int regex_insts = d_prog.insts_counts();
-  if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
+  if (regex_insts <= RX_SMALL_INSTS)
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<size_type>(0),
                       thrust::make_counting_iterator<size_type>(strings_count),
@@ -103,12 +101,18 @@ std::unique_ptr<column> contains_util(
                       thrust::make_counting_iterator<size_type>(strings_count),
                       d_results,
                       contains_fn<RX_STACK_MEDIUM>{d_prog, d_column, beginning_only});
-  else
+  else if (regex_insts <= RX_LARGE_INSTS)
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<size_type>(0),
                       thrust::make_counting_iterator<size_type>(strings_count),
                       d_results,
                       contains_fn<RX_STACK_LARGE>{d_prog, d_column, beginning_only});
+  else
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(strings_count),
+                      d_results,
+                      contains_fn<RX_STACK_ANY>{d_prog, d_column, beginning_only});
 
   results->set_null_count(strings.null_count());
   return results;
@@ -166,8 +170,6 @@ struct count_fn {
 
   __device__ int32_t operator()(unsigned int idx)
   {
-    u_char data1[stack_size], data2[stack_size];
-    prog.set_stack_mem(data1, data2);
     if (d_strings.is_null(idx)) return 0;
     string_view d_str  = d_strings.element<string_view>(idx);
     auto const nchars  = d_str.length();
@@ -175,7 +177,7 @@ struct count_fn {
     int32_t begin      = 0;
     while (begin < nchars) {
       auto end = static_cast<int32_t>(nchars);
-      if (prog.find(idx, d_str, begin, end) <= 0) break;
+      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) break;
       ++find_count;
       begin = end > begin ? end : begin + 1;
     }
@@ -210,7 +212,7 @@ std::unique_ptr<column> count_re(
 
   // fill the output column
   int regex_insts = d_prog.insts_counts();
-  if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
+  if (regex_insts <= RX_SMALL_INSTS)
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<size_type>(0),
                       thrust::make_counting_iterator<size_type>(strings_count),
@@ -222,12 +224,18 @@ std::unique_ptr<column> count_re(
                       thrust::make_counting_iterator<size_type>(strings_count),
                       d_results,
                       count_fn<RX_STACK_MEDIUM>{d_prog, d_column});
-  else
+  else if (regex_insts <= RX_LARGE_INSTS)
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<size_type>(0),
                       thrust::make_counting_iterator<size_type>(strings_count),
                       d_results,
                       count_fn<RX_STACK_LARGE>{d_prog, d_column});
+  else
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(strings_count),
+                      d_results,
+                      count_fn<RX_STACK_ANY>{d_prog, d_column});
 
   results->set_null_count(strings.null_count());
   return results;
diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract.cu
index f33c0c01fb6..438f031d3b8 100644
--- a/cpp/src/strings/extract.cu
+++ b/cpp/src/strings/extract.cu
@@ -50,15 +50,13 @@ struct extract_fn {
 
   __device__ string_index_pair operator()(size_type idx)
   {
-    u_char data1[stack_size], data2[stack_size];
-    prog.set_stack_mem(data1, data2);
     if (d_strings.is_null(idx)) return string_index_pair{nullptr, 0};
     string_view d_str = d_strings.element<string_view>(idx);
     string_index_pair result{nullptr, 0};
     int32_t begin = 0;
     int32_t end   = -1;  // handles empty strings automatically
-    if ((prog.find(idx, d_str, begin, end) > 0) &&
-        (prog.extract(idx, d_str, begin, end, column_index) > 0)) {
+    if ((prog.find<stack_size>(idx, d_str, begin, end) > 0) &&
+        (prog.extract<stack_size>(idx, d_str, begin, end, column_index) > 0)) {
       auto offset = d_str.byte_offset(begin);
       // build index-pair
       result = string_index_pair{d_str.data() + offset, d_str.byte_offset(end) - offset};
@@ -94,7 +92,7 @@ std::unique_ptr<table> extract(
   for (int32_t column_index = 0; column_index < groups; ++column_index) {
     rmm::device_uvector<string_index_pair> indices(strings_count, stream);
 
-    if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
+    if (regex_insts <= RX_SMALL_INSTS)
       thrust::transform(rmm::exec_policy(stream),
                         thrust::make_counting_iterator<size_type>(0),
                         thrust::make_counting_iterator<size_type>(strings_count),
@@ -106,12 +104,18 @@ std::unique_ptr<table> extract(
                         thrust::make_counting_iterator<size_type>(strings_count),
                         indices.begin(),
                         extract_fn<RX_STACK_MEDIUM>{d_prog, d_strings, column_index});
-    else
+    else if (regex_insts <= RX_LARGE_INSTS)
       thrust::transform(rmm::exec_policy(stream),
                         thrust::make_counting_iterator<size_type>(0),
                         thrust::make_counting_iterator<size_type>(strings_count),
                         indices.begin(),
                         extract_fn<RX_STACK_LARGE>{d_prog, d_strings, column_index});
+    else
+      thrust::transform(rmm::exec_policy(stream),
+                        thrust::make_counting_iterator<size_type>(0),
+                        thrust::make_counting_iterator<size_type>(strings_count),
+                        indices.begin(),
+                        extract_fn<RX_STACK_ANY>{d_prog, d_strings, column_index});
 
     results.emplace_back(make_strings_column(indices, stream, mr));
   }
diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu
index bcd9c808271..3ab5b55020c 100644
--- a/cpp/src/strings/findall.cu
+++ b/cpp/src/strings/findall.cu
@@ -43,7 +43,7 @@ namespace {
  * @brief This functor handles extracting matched strings by applying the compiled regex pattern
  * and creating string_index_pairs for all the substrings.
  */
-template <size_t stack_size>
+template <int stack_size>
 struct findall_fn {
   column_device_view const d_strings;
   reprog_device prog;
@@ -64,17 +64,14 @@ struct findall_fn {
     string_index_pair result{nullptr, 0};
     if (d_strings.is_null(idx) || (d_counts && (column_index >= d_counts[idx])))
       return findall_result{0, result};
-    u_char data1[stack_size];
-    u_char data2[stack_size];
-    prog.set_stack_mem(data1, data2);
     string_view d_str      = d_strings.element<string_view>(idx);
     auto const nchars      = d_str.length();
     int32_t spos           = 0;
     int32_t epos           = static_cast<int32_t>(nchars);
     size_type column_count = 0;
     while (spos <= nchars) {
-      if (prog.find(idx, d_str, spos, epos) <= 0) break;  // no more matches found
-      if (column_count == column_index) break;            // found our column
+      if (prog.find<stack_size>(idx, d_str, spos, epos) <= 0) break;  // no more matches found
+      if (column_count == column_index) break;                        // found our column
       spos = epos > spos ? epos : spos + 1;
       epos = static_cast<int32_t>(nchars);
       ++column_count;
@@ -129,7 +126,7 @@ std::unique_ptr<table> findall_re(
   rmm::device_uvector<size_type> find_counts(strings_count, stream);
   auto d_find_counts = find_counts.data();
 
-  if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
+  if (regex_insts <= RX_SMALL_INSTS)
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<size_type>(0),
                       thrust::make_counting_iterator<size_type>(strings_count),
@@ -141,12 +138,18 @@ std::unique_ptr<table> findall_re(
                       thrust::make_counting_iterator<size_type>(strings_count),
                       d_find_counts,
                       findall_count_fn<RX_STACK_MEDIUM>{*d_strings, *d_prog});
-  else
+  else if (regex_insts <= RX_LARGE_INSTS)
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<size_type>(0),
                       thrust::make_counting_iterator<size_type>(strings_count),
                       d_find_counts,
                       findall_count_fn<RX_STACK_LARGE>{*d_strings, *d_prog});
+  else
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(strings_count),
+                      d_find_counts,
+                      findall_count_fn<RX_STACK_ANY>{*d_strings, *d_prog});
 
   std::vector<std::unique_ptr<column>> results;
 
@@ -167,7 +170,7 @@ std::unique_ptr<table> findall_re(
   for (int32_t column_index = 0; column_index < columns; ++column_index) {
     rmm::device_uvector<string_index_pair> indices(strings_count, stream);
 
-    if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
+    if (regex_insts <= RX_SMALL_INSTS)
       thrust::transform(
         rmm::exec_policy(stream),
         thrust::make_counting_iterator<size_type>(0),
@@ -181,13 +184,20 @@ std::unique_ptr<table> findall_re(
         thrust::make_counting_iterator<size_type>(strings_count),
         indices.begin(),
         findall_fn<RX_STACK_MEDIUM>{*d_strings, *d_prog, column_index, d_find_counts});
-    else
+    else if (regex_insts <= RX_LARGE_INSTS)
       thrust::transform(
         rmm::exec_policy(stream),
         thrust::make_counting_iterator<size_type>(0),
         thrust::make_counting_iterator<size_type>(strings_count),
         indices.begin(),
         findall_fn<RX_STACK_LARGE>{*d_strings, *d_prog, column_index, d_find_counts});
+    else
+      thrust::transform(rmm::exec_policy(stream),
+                        thrust::make_counting_iterator<size_type>(0),
+                        thrust::make_counting_iterator<size_type>(strings_count),
+                        indices.begin(),
+                        findall_fn<RX_STACK_ANY>{*d_strings, *d_prog, column_index, d_find_counts});
+
     //
     results.emplace_back(make_strings_column(indices.begin(), indices.end(), stream, mr));
   }
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index 6e03c183a8d..5e9811d6897 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,24 @@ struct reljunk;
 struct reinst;
 class reprog;
 
+constexpr int32_t RX_STACK_SMALL  = 112;    ///< fastest stack size
+constexpr int32_t RX_STACK_MEDIUM = 1104;   ///< faster stack size
+constexpr int32_t RX_STACK_LARGE  = 10128;  ///< fast stack size
+constexpr int32_t RX_STACK_ANY    = 8;      ///< slowest: uses global memory
+
+/**
+ * @brief Mapping the number of instructions to device code stack memory size.
+ *
+ * ```
+ * 10128 ≈ 1000 instructions
+ * Formula is based on relist::data_size_for() calculation;
+ * Stack ≈ (8+2)*x + (x/8) = 10.125x < 11x  where x is number of instructions
+ * ```
+ */
+constexpr int32_t RX_SMALL_INSTS  = (RX_STACK_SMALL / 11);
+constexpr int32_t RX_MEDIUM_INSTS = (RX_STACK_MEDIUM / 11);
+constexpr int32_t RX_LARGE_INSTS  = (RX_STACK_LARGE / 11);
+
 /**
  * @brief Regex class stored on the device and executed by reprog_device.
  *
@@ -99,14 +117,7 @@ class reprog_device {
   /**
    * @brief Returns the number of regex groups found in the expression.
    */
-  int32_t group_counts() const { return _num_capturing_groups; }
-
-  /**
-   * @brief This sets up the memory used for keeping track of the regex progress.
-   *
-   * Call this for each string before calling find or extract.
-   */
-  __device__ inline void set_stack_mem(u_char* s1, u_char* s2);
+  __host__ __device__ inline int32_t group_counts() const { return _num_capturing_groups; }
 
   /**
    * @brief Returns the regex instruction object for a given index.
@@ -126,6 +137,7 @@ class reprog_device {
   /**
    * @brief Does a find evaluation using the compiled expression on the given string.
    *
+   * @tparam stack_size One of the `RX_STACK_` values based on the `insts_count`.
    * @param idx The string index used for mapping the state memory for this string in global memory
    * (if necessary).
    * @param d_str The string to search.
@@ -135,6 +147,7 @@ class reprog_device {
    * matching in the string.
    * @return Returns 0 if no match is found.
    */
+  template <int stack_size>
   __device__ inline int32_t find(int32_t idx,
                                  string_view const& d_str,
                                  int32_t& begin,
@@ -145,6 +158,7 @@ class reprog_device {
    *
    * This will find a specific match within the string when more than match occurs.
    *
+   * @tparam stack_size One of the `RX_STACK_` values based on the `insts_count`.
    * @param idx The string index used for mapping the state memory for this string in global memory
    * (if necessary).
    * @param d_str The string to search.
@@ -152,11 +166,12 @@ class reprog_device {
    * in the string.
    * @param[in,out] end Position index to end the search. If found, returns the last position
    * matching in the string.
-   * @param column The specific instance to return if more than one match is found.
+   * @param group_id The specific instance to return if more than one match is found.
    * @return Returns 0 if no match is found.
    */
+  template <int stack_size>
   __device__ inline int32_t extract(
-    int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t column);
+    int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t group_id);
 
  private:
   int32_t _startinst_id, _num_capturing_groups;
@@ -166,8 +181,6 @@ class reprog_device {
   int32_t* _startinst_ids{};          // array of start instruction ids
   reclass_device* _classes{};         // array of regex classes
   void* _relists_mem{};               // runtime relist memory for regexec
-  u_char* _stack_mem1{};              // memory for relist object 1
-  u_char* _stack_mem2{};              // memory for relist object 2
 
   /**
    * @brief Executes the regex pattern on the given string.
@@ -178,25 +191,13 @@ class reprog_device {
   /**
    * @brief Utility wrapper to setup state memory structures for calling regexec
    */
+  template <int stack_size>
   __device__ inline int32_t call_regexec(
     int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t groupid = 0);
 
   reprog_device(reprog&);  // must use create()
 };
 
-// 10128 ≈ 1000 instructions
-// Formula is based on relist::data_size_for() calculation;
-// Stack ≈ (8+2)*x + (x/8) = 10.125x < 11x  where x is number of instructions
-constexpr int32_t MAX_STACK_INSTS = 1000;
-
-constexpr int32_t RX_STACK_SMALL  = 112;
-constexpr int32_t RX_STACK_MEDIUM = 1104;
-constexpr int32_t RX_STACK_LARGE  = 10128;
-
-constexpr int32_t RX_SMALL_INSTS  = (RX_STACK_SMALL / 11);
-constexpr int32_t RX_MEDIUM_INSTS = (RX_STACK_MEDIUM / 11);
-constexpr int32_t RX_LARGE_INSTS  = (RX_STACK_LARGE / 11);
-
 }  // namespace detail
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index 843dc9a7ca7..caa9550b9d1 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -38,12 +38,12 @@ namespace detail {
  * reflected here. The regexec function updates and manages this state data.
  */
 struct alignas(8) relist {
-  int16_t size;
-  int16_t listsize;
+  int16_t size{};
+  int16_t listsize{};
   int32_t reserved;
-  int2* ranges;       // pair per instruction
-  int16_t* inst_ids;  // one per instruction
-  u_char* mask;       // bit per instruction
+  int2* ranges{};       // pair per instruction
+  int16_t* inst_ids{};  // one per instruction
+  u_char* mask{};       // bit per instruction
 
   __host__ __device__ inline static int32_t data_size_for(int32_t insts)
   {
@@ -60,12 +60,10 @@ struct alignas(8) relist {
 
   __host__ __device__ inline relist() {}
 
-  __host__ __device__ inline void set_data(int16_t insts, u_char* data = nullptr)
+  __host__ __device__ inline relist(int16_t insts, u_char* data = nullptr) : listsize(insts)
   {
-    listsize    = insts;
-    u_char* ptr = (u_char*)data;
-    if (ptr == nullptr) ptr = (reinterpret_cast<u_char*>(this)) + sizeof(relist);
-    ranges = reinterpret_cast<int2*>(ptr);
+    auto ptr = data == nullptr ? reinterpret_cast<u_char*>(this) + sizeof(relist) : data;
+    ranges   = reinterpret_cast<int2*>(ptr);
     ptr += listsize * sizeof(ranges[0]);
     inst_ids = reinterpret_cast<int16_t*>(ptr);
     ptr += listsize * sizeof(inst_ids[0]);
@@ -111,8 +109,17 @@ struct alignas(8) relist {
 struct reljunk {
   relist* list1;
   relist* list2;
-  int32_t starttype;
-  char32_t startchar;
+  int32_t starttype{};
+  char32_t startchar{};
+
+  __host__ __device__ reljunk(relist* list1, relist* list2, int32_t stype, char32_t schar)
+    : list1(list1), list2(list2)
+  {
+    if (stype == CHAR || stype == BOL) {  // check the argument, not the zero-initialized member
+      starttype = stype;
+      startchar = schar;
+    }
+  }
 };
 
 __device__ inline void swaplist(relist*& l1, relist*& l2)
@@ -158,22 +165,6 @@ __device__ inline bool reclass_device::is_match(char32_t ch, const uint8_t* code
   return false;
 }
 
-/**
- * @brief Set the device data to be used for holding the state data of a string.
- *
- * With one thread per string, the stack is used to maintain state when evaluating the string.
- * With large regex patterns, the normal stack is not always practical.
- * This mechanism allows an alternate buffer of device memory to be used in place of the stack
- * for the state data.
- *
- * Two distinct buffers are required for the state data.
- */
-__device__ inline void reprog_device::set_stack_mem(u_char* s1, u_char* s2)
-{
-  _stack_mem1 = s1;
-  _stack_mem2 = s2;
-}
-
 __device__ inline reinst* reprog_device::get_inst(int32_t idx) const
 {
   assert((idx >= 0) && (idx < _insts_count));
@@ -370,52 +361,57 @@ __device__ inline int32_t reprog_device::regexec(
   return match;
 }
 
+template <int stack_size>
 __device__ inline int32_t reprog_device::find(int32_t idx,
                                               string_view const& dstr,
                                               int32_t& begin,
                                               int32_t& end)
 {
-  int32_t rtn = call_regexec(idx, dstr, begin, end);
+  int32_t rtn = call_regexec<stack_size>(idx, dstr, begin, end);
   if (rtn <= 0) begin = end = -1;
   return rtn;
 }
 
+template <int stack_size>
 __device__ inline int32_t reprog_device::extract(
   int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id)
 {
   end = begin + 1;
-  return call_regexec(idx, dstr, begin, end, group_id + 1);
+  return call_regexec<stack_size>(idx, dstr, begin, end, group_id + 1);
 }
 
+template <int stack_size>
 __device__ inline int32_t reprog_device::call_regexec(
   int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id)
 {
-  reljunk jnk;
-  jnk.starttype = 0;
-  jnk.startchar = 0;
-  int type      = get_inst(_startinst_id)->type;
-  if (type == CHAR || type == BOL) {
-    jnk.starttype = type;
-    jnk.startchar = get_inst(_startinst_id)->u1.c;
-  }
+  u_char data1[stack_size], data2[stack_size];
 
-  if (_relists_mem == 0) {
-    relist relist1;
-    relist relist2;
-    jnk.list1 = &relist1;
-    jnk.list2 = &relist2;
-    jnk.list1->set_data(static_cast<int16_t>(_insts_count), _stack_mem1);
-    jnk.list2->set_data(static_cast<int16_t>(_insts_count), _stack_mem2);
-    return regexec(dstr, jnk, begin, end, group_id);
-  }
+  auto const stype = get_inst(_startinst_id)->type;
+  auto const schar = get_inst(_startinst_id)->u1.c;
+
+  relist list1(static_cast<int16_t>(_insts_count), data1);
+  relist list2(static_cast<int16_t>(_insts_count), data2);
+
+  reljunk jnk(&list1, &list2, stype, schar);
+  return regexec(dstr, jnk, begin, end, group_id);
+}
+
+template <>
+__device__ inline int32_t reprog_device::call_regexec<RX_STACK_ANY>(
+  int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id)
+{
+  auto const stype = get_inst(_startinst_id)->type;
+  auto const schar = get_inst(_startinst_id)->u1.c;
+
+  auto const relists_size = relist::alloc_size(_insts_count);
+  u_char* listmem         = reinterpret_cast<u_char*>(_relists_mem);  // beginning of relist buffer;
+  listmem += (idx * relists_size * 2);                                // two relist ptrs in reljunk:
+
+  // run ctor on assigned memory buffer
+  relist* list1 = new (listmem) relist(static_cast<int16_t>(_insts_count));
+  relist* list2 = new (listmem + relists_size) relist(static_cast<int16_t>(_insts_count));
 
-  auto relists_size = relist::alloc_size(_insts_count);
-  u_char* drel      = reinterpret_cast<u_char*>(_relists_mem);  // beginning of relist buffer;
-  drel += (idx * relists_size * 2);                             // two relist ptrs in reljunk:
-  jnk.list1 = reinterpret_cast<relist*>(drel);                  // - first one
-  jnk.list2 = reinterpret_cast<relist*>(drel + relists_size);   // - second one
-  jnk.list1->set_data(static_cast<int16_t>(_insts_count));      // essentially this is
-  jnk.list2->set_data(static_cast<int16_t>(_insts_count));      // substitute ctor call
+  reljunk jnk(list1, list2, stype, schar);
   return regexec(dstr, jnk, begin, end, group_id);
 }
 
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index b76e1932196..bd040eecaa6 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -68,9 +68,7 @@ reprog_device::reprog_device(reprog& prog)
     _insts_count{prog.insts_count()},
     _starts_count{prog.starts_count()},
     _classes_count{prog.classes_count()},
-    _relists_mem{nullptr},
-    _stack_mem1{nullptr},
-    _stack_mem2{nullptr}
+    _relists_mem{nullptr}
 {
 }
 
@@ -100,7 +98,7 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   size_t memsize  = insts_size + startids_size + classes_size;
   size_t rlm_size = 0;
   // check memory size needed for executing regex
-  if (insts_count > MAX_STACK_INSTS) {
+  if (insts_count > RX_LARGE_INSTS) {
     auto relist_alloc_size = relist::alloc_size(insts_count);
     rlm_size               = relist_alloc_size * 2L * strings_count;  // reljunk has 2 relist ptrs
   }
diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu
index 1ed1ee4d96f..07d5fefc264 100644
--- a/cpp/src/strings/replace/backref_re.cu
+++ b/cpp/src/strings/replace/backref_re.cu
@@ -114,19 +114,27 @@ std::unique_ptr<column> replace_with_backrefs(
   children_pair children = [&] {
     // Each invocation is predicated on the stack size
     // which is dependent on the number of regex instructions
-    if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) {
+    if (regex_insts <= RX_SMALL_INSTS) {
       return make_strings_children(
         backrefs_fn<BackRefIterator, RX_STACK_SMALL>{
           *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
         strings.size(),
         stream,
         mr);
-    } else if (regex_insts <= RX_MEDIUM_INSTS)
+    } else if (regex_insts <= RX_MEDIUM_INSTS) {
       return replace_with_backrefs_medium(
         *d_strings, *d_prog, d_repl_template, backrefs, stream, mr);
-    else
+    } else if (regex_insts <= RX_LARGE_INSTS) {
       return replace_with_backrefs_large(
         *d_strings, *d_prog, d_repl_template, backrefs, stream, mr);
+    } else {
+      return make_strings_children(
+        backrefs_fn<BackRefIterator, RX_STACK_ANY>{
+          *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
+        strings.size(),
+        stream,
+        mr);
+    }
   }();
 
   return make_strings_column(strings.size(),
diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh
index d9ce887a689..9c14e5acaa9 100644
--- a/cpp/src/strings/replace/backref_re.cuh
+++ b/cpp/src/strings/replace/backref_re.cuh
@@ -42,7 +42,7 @@ using backref_type = thrust::pair<size_type, size_type>;
  * Small to medium instruction lengths can use the stack effectively though smaller executes faster.
  * Longer patterns require global memory. Shorter patterns are common in data cleaning.
  */
-template <typename Iterator, size_t stack_size>
+template <typename Iterator, int stack_size>
 struct backrefs_fn {
   column_device_view const d_strings;
   reprog_device prog;
@@ -58,9 +58,6 @@ struct backrefs_fn {
       if (!d_chars) d_offsets[idx] = 0;
       return;
     }
-    u_char data1[stack_size];
-    u_char data2[stack_size];
-    prog.set_stack_mem(data1, data2);
     auto const d_str  = d_strings.element<string_view>(idx);
     auto const nchars = d_str.length();      // number of characters in input string
     auto nbytes       = d_str.size_bytes();  // number of bytes in input string
@@ -70,7 +67,7 @@ struct backrefs_fn {
     size_type begin   = 0;       // first character position matching regex
     size_type end     = nchars;  // last character position (exclusive)
     // copy input to output replacing strings as we go
-    while (prog.find(idx, d_str, begin, end) > 0)  // inits the begin/end vars
+    while (prog.find<stack_size>(idx, d_str, begin, end) > 0)  // inits the begin/end vars
     {
       auto spos = d_str.byte_offset(begin);           // get offset for these
       auto epos = d_str.byte_offset(end);             // character position values
@@ -88,7 +85,8 @@ struct backrefs_fn {
           // extract the specific group's string for this backref's index
           int32_t spos_extract = begin;  // these are modified
           int32_t epos_extract = end;    // by extract()
-          if ((prog.extract(idx, d_str, spos_extract, epos_extract, backref.first - 1) <= 0) ||
+          if ((prog.extract<stack_size>(
+                 idx, d_str, spos_extract, epos_extract, backref.first - 1) <= 0) ||
               (epos_extract <= spos_extract))
             return;  // no value for this backref number; that is ok
           spos_extract = d_str.byte_offset(spos_extract);  // convert
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index 2672dc4fb7a..a59401db24f 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -51,7 +51,7 @@ using found_range = thrust::pair<size_type, size_type>;
  * Small to medium instruction lengths can use the stack effectively though smaller executes faster.
  * Longer patterns require global memory. Shorter patterns are common in data cleaning.
  */
-template <size_t stack_size>
+template <int stack_size>
 struct replace_multi_regex_fn {
   column_device_view const d_strings;
   reprog_device* progs;  // array of regex progs
@@ -67,8 +67,6 @@ struct replace_multi_regex_fn {
       if (!d_chars) d_offsets[idx] = 0;
       return;
     }
-    u_char data1[stack_size];
-    u_char data2[stack_size];
     auto const d_str      = d_strings.element<string_view>(idx);
     auto const nchars     = d_str.length();      // number of characters in input string
     auto nbytes           = d_str.size_bytes();  // number of bytes in input string
@@ -87,10 +85,10 @@ struct replace_multi_regex_fn {
         if (d_ranges[ptn_idx].first >= ch_pos)  // previously matched here
           continue;                             // or later in the string
         reprog_device prog = progs[ptn_idx];
-        prog.set_stack_mem(data1, data2);
+
         auto begin = static_cast<int32_t>(ch_pos);
         auto end   = static_cast<int32_t>(nchars);
-        if (!prog.is_empty() && prog.find(idx, d_str, begin, end) > 0)
+        if (!prog.is_empty() && prog.find<stack_size>(idx, d_str, begin, end) > 0)
           d_ranges[ptn_idx] = found_range{begin, end};  // found a match
         else
           d_ranges[ptn_idx] = found_range{nchars, nchars};  // this pattern is done
@@ -176,7 +174,7 @@ std::unique_ptr<column> replace_re(
   auto children = [&] {
     // Each invocation is predicated on the stack size which is dependent on the number of regex
     // instructions
-    if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
+    if (regex_insts <= RX_SMALL_INSTS)
       return make_strings_children(
         replace_multi_regex_fn<RX_STACK_SMALL>{
           *d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, *d_repls},
@@ -190,13 +188,20 @@ std::unique_ptr<column> replace_re(
         strings_count,
         stream,
         mr);
-    else
+    else if (regex_insts <= RX_LARGE_INSTS)
       return make_strings_children(
         replace_multi_regex_fn<RX_STACK_LARGE>{
           *d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, *d_repls},
         strings_count,
         stream,
         mr);
+    else
+      return make_strings_children(
+        replace_multi_regex_fn<RX_STACK_ANY>{
+          *d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, *d_repls},
+        strings_count,
+        stream,
+        mr);
   }();
 
   return make_strings_column(strings_count,
diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index a4606a599bb..9468e80fa1c 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,7 +48,7 @@ namespace {
  * Small to medium instruction lengths can use the stack effectively though smaller executes faster.
  * Longer patterns require global memory. Shorter patterns are common in data cleaning.
  */
-template <size_t stack_size>
+template <int stack_size>
 struct replace_regex_fn {
   column_device_view const d_strings;
   reprog_device prog;
@@ -63,9 +63,6 @@ struct replace_regex_fn {
       if (!d_chars) d_offsets[idx] = 0;
       return;
     }
-    u_char data1[stack_size];
-    u_char data2[stack_size];
-    prog.set_stack_mem(data1, data2);
     auto const d_str  = d_strings.element<string_view>(idx);
     auto const nchars = d_str.length();                  // number of characters in input string
     auto nbytes       = d_str.size_bytes();              // number of bytes in input string
@@ -78,8 +75,9 @@ struct replace_regex_fn {
     // copy input to output replacing strings as we go
     while (mxn-- > 0)  // maximum number of replaces
     {
-      if (prog.is_empty() || prog.find(idx, d_str, begin, end) <= 0) break;  // no more matches
-      auto spos = d_str.byte_offset(begin);                                  // get offset for these
+      if (prog.is_empty() || prog.find<stack_size>(idx, d_str, begin, end) <= 0)
+        break;                                        // no more matches
+      auto spos = d_str.byte_offset(begin);           // get offset for these
       auto epos = d_str.byte_offset(end);             // character position values
       nbytes += d_repl.size_bytes() - (epos - spos);  // compute new size
       if (out_ptr)                                    // replace
@@ -128,27 +126,34 @@ std::unique_ptr<column> replace_re(
   auto null_count = strings.null_count();
 
   // create child columns
-  std::pair<std::unique_ptr<column>, std::unique_ptr<column>> children(nullptr, nullptr);
-  // Each invocation is predicated on the stack size which is dependent on the number of regex
-  // instructions
-  if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
-    children =
-      make_strings_children(replace_regex_fn<RX_STACK_SMALL>{d_strings, d_prog, d_repl, maxrepl},
-                            strings_count,
-                            stream,
-                            mr);
-  else if (regex_insts <= RX_MEDIUM_INSTS)
-    children =
-      make_strings_children(replace_regex_fn<RX_STACK_MEDIUM>{d_strings, d_prog, d_repl, maxrepl},
-                            strings_count,
-                            stream,
-                            mr);
-  else
-    children =
-      make_strings_children(replace_regex_fn<RX_STACK_LARGE>{d_strings, d_prog, d_repl, maxrepl},
-                            strings_count,
-                            stream,
-                            mr);
+  auto children = [&] {
+    // Each invocation is predicated on the stack size which is dependent on the number of regex
+    // instructions
+    if (regex_insts <= RX_SMALL_INSTS)
+      return make_strings_children(
+        replace_regex_fn<RX_STACK_SMALL>{d_strings, d_prog, d_repl, maxrepl},
+        strings_count,
+        stream,
+        mr);
+    else if (regex_insts <= RX_MEDIUM_INSTS)
+      return make_strings_children(
+        replace_regex_fn<RX_STACK_MEDIUM>{d_strings, d_prog, d_repl, maxrepl},
+        strings_count,
+        stream,
+        mr);
+    else if (regex_insts <= RX_LARGE_INSTS)
+      return make_strings_children(
+        replace_regex_fn<RX_STACK_LARGE>{d_strings, d_prog, d_repl, maxrepl},
+        strings_count,
+        stream,
+        mr);
+    else
+      return make_strings_children(
+        replace_regex_fn<RX_STACK_ANY>{d_strings, d_prog, d_repl, maxrepl},
+        strings_count,
+        stream,
+        mr);
+  }();
 
   return make_strings_column(strings_count,
                              std::move(children.first),
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index e86c344542f..ddd6fc9e1dc 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -372,3 +372,28 @@ TEST_F(StringsContainsTests, LargeRegex)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 }
+
+TEST_F(StringsContainsTests, ExtraLargeRegex)
+{
+  // This results in ~950 regex instructions which is above the 'large' range.
+  std::string data(950, '0');
+  cudf::test::strings_column_wrapper strings({data, data, data, data, data, "00"});
+  std::string pattern = data;
+
+  auto strings_view = cudf::strings_column_view(strings);
+  {
+    auto results = cudf::strings::contains_re(strings_view, pattern);
+    cudf::test::fixed_width_column_wrapper<bool> expected({true, true, true, true, true, false});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  }
+  {
+    auto results = cudf::strings::matches_re(strings_view, pattern);
+    cudf::test::fixed_width_column_wrapper<bool> expected({true, true, true, true, true, false});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  }
+  {
+    auto results = cudf::strings::count_re(strings_view, pattern);
+    cudf::test::fixed_width_column_wrapper<int32_t> expected({1, 1, 1, 1, 1, 0});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  }
+}

From 92ed5b3bfe3b67d384f62cf81415e1767a7a19b6 Mon Sep 17 00:00:00 2001
From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com>
Date: Mon, 7 Jun 2021 13:23:05 -0700
Subject: [PATCH 09/15] Implement `.describe() ` for `DataFrameGroupBy` (#8179)

This PR implements functionality to generate summary statistics for the `DataFrame.groupby()` operation
via the `.describe()` method, similar to pandas.


```
>>> import pandas as pd
>>> pdf = pd.DataFrame({"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]})
>>> pdf
   Speed  Score
0  380.0     50
1  370.0     30
2   24.0     90
3   26.0     80
>>> pdf.groupby('Score').describe()
                                                    Speed
      count   mean std    min    25%    50%    75%    max
Score
30      1.0  370.0 NaN  370.0  370.0  370.0  370.0  370.0
50      1.0  380.0 NaN  380.0  380.0  380.0  380.0  380.0
80      1.0   26.0 NaN   26.0   26.0   26.0   26.0   26.0
90      1.0   24.0 NaN   24.0   24.0   24.0   24.0   24.0


>>> import cudf
>>> gdf = cudf.from_pandas(pdf)
>>> gdf.groupby('Score').describe()
       count   mean   std    min    25%    50%    75%    max
Score
30         1  370.0  <NA>  370.0  370.0  370.0  370.0  370.0
50         1  380.0  <NA>  380.0  380.0  380.0  380.0  380.0
80         1   26.0  <NA>   26.0   26.0   26.0   26.0   26.0
90         1   24.0  <NA>   24.0   24.0   24.0   24.0   24.0

```


Fixes: https://github.com/rapidsai/cudf/issues/7990

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ashwin Srinath (https://github.com/shwina)
  - Michael Wang (https://github.com/isVoid)
  - Christopher Harris (https://github.com/cwharris)

URL: https://github.com/rapidsai/cudf/pull/8179
---
 python/cudf/cudf/core/groupby/groupby.py | 84 ++++++++++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   | 22 +++++++
 2 files changed, 106 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 43476b4b781..17104076ebd 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -14,6 +14,21 @@
 from cudf.utils.utils import GetAttrGetItemMixin, cached_property
 
 
+# The three functions below return the quantiles [25%, 50%, 75%]
+# respectively, which are called in the describe() method to output
+# the summary stats of a GroupBy object
+def _quantile_25(x):
+    return x.quantile(0.25)
+
+
+def _quantile_50(x):
+    return x.quantile(0.50)
+
+
+def _quantile_75(x):
+    return x.quantile(0.75)
+
+
 # Note that all valid aggregation methods (e.g. GroupBy.min) are bound to the
 # class after its definition (see below).
 class GroupBy(Serializable):
@@ -601,6 +616,75 @@ def func(x):
 
         return self.agg(func)
 
+    def describe(self, include=None, exclude=None):
+        """
+        Generate descriptive statistics that summarizes the central tendency,
+        dispersion and shape of a dataset’s distribution, excluding NaN values.
+
+        Analyzes numeric DataFrames only
+
+        Parameters
+        ----------
+        include: ‘all’, list-like of dtypes or None (default), optional
+            list of data types to include in the result.
+            Ignored for Series.
+
+        exclude: list-like of dtypes or None (default), optional,
+            list of data types to omit from the result.
+            Ignored for Series.
+
+        Returns
+        -------
+        Series or DataFrame
+            Summary statistics of the Dataframe provided.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gdf = cudf.DataFrame({"Speed": [380.0, 370.0, 24.0, 26.0],
+                                  "Score": [50, 30, 90, 80]})
+        >>> gdf
+        Speed  Score
+        0  380.0     50
+        1  370.0     30
+        2   24.0     90
+        3   26.0     80
+        >>> gdf.groupby('Score').describe()
+            Speed
+            count   mean   std    min    25%    50%    75%     max
+        Score
+        30        1  370.0  <NA>  370.0  370.0  370.0  370.0  370.0
+        50        1  380.0  <NA>  380.0  380.0  380.0  380.0  380.0
+        80        1   26.0  <NA>   26.0   26.0   26.0   26.0   26.0
+        90        1   24.0  <NA>   24.0   24.0   24.0   24.0   24.0
+
+        """
+        if exclude is not None and include is not None:
+            raise NotImplementedError
+
+        res = self.agg(
+            [
+                "count",
+                "mean",
+                "std",
+                "min",
+                _quantile_25,
+                _quantile_50,
+                _quantile_75,
+                "max",
+            ]
+        )
+        res.rename(
+            columns={
+                "_quantile_25": "25%",
+                "_quantile_50": "50%",
+                "_quantile_75": "75%",
+            },
+            level=1,
+            inplace=True,
+        )
+        return res
+
     def sum(self):
         """Compute the column-wise sum of the values in each group."""
         return self.agg("sum")
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index e774bda4914..6ba2354d5d5 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1901,3 +1901,25 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
     assert_groupby_results_equal(
         expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
     )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]},
+        {
+            "Speed": [380.0, 370.0, 24.0, 26.0],
+            "Score": [50, 30, 90, 80],
+            "Other": [10, 20, 30, 40],
+        },
+    ],
+)
+@pytest.mark.parametrize("group", ["Score", "Speed"])
+def test_groupby_describe(data, group):
+    pdf = pd.DataFrame(data)
+    gdf = cudf.from_pandas(pdf)
+
+    got = gdf.groupby(group).describe()
+    expect = pdf.groupby(group).describe()
+
+    assert_groupby_results_equal(expect, got, check_dtype=False)

From badb5011e40f36457bf01167357e78c16defd499 Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Mon, 7 Jun 2021 14:50:57 -0600
Subject: [PATCH 10/15] Replace `all_null()` and `all_valid()` by
 `iterator_all_nulls()` and `iterator_no_null()` in tests (#8437)

This PR does some cleanup for tests in copying and groupby. In particular, it replaces the functions `all_null()` and `all_valid()` with `iterator_all_nulls()` and `iterator_no_null()` from `iterator_utilities.hpp`. This is because the former functions (`all_null()` and `all_valid()`) are just duplicate reimplementations of the latter ones.

There is no other change in this work.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: https://github.com/rapidsai/cudf/pull/8437
---
 cpp/tests/copying/get_value_tests.cpp      | 14 ++++++------
 cpp/tests/groupby/argmax_tests.cpp         | 13 +++++------
 cpp/tests/groupby/argmin_tests.cpp         | 13 +++++------
 cpp/tests/groupby/count_scan_tests.cpp     |  7 +++---
 cpp/tests/groupby/count_tests.cpp          |  7 +++---
 cpp/tests/groupby/groupby_test_util.hpp    | 12 -----------
 cpp/tests/groupby/groups_tests.cpp         |  5 +++--
 cpp/tests/groupby/keys_tests.cpp           |  7 +++---
 cpp/tests/groupby/max_scan_tests.cpp       |  9 ++++----
 cpp/tests/groupby/max_tests.cpp            | 13 +++++------
 cpp/tests/groupby/mean_tests.cpp           |  9 ++++----
 cpp/tests/groupby/median_tests.cpp         | 13 +++++------
 cpp/tests/groupby/min_scan_tests.cpp       |  9 ++++----
 cpp/tests/groupby/min_tests.cpp            | 13 +++++------
 cpp/tests/groupby/nth_element_tests.cpp    | 15 +++++++------
 cpp/tests/groupby/nunique_tests.cpp        | 13 +++++------
 cpp/tests/groupby/product_tests.cpp        | 15 +++++++------
 cpp/tests/groupby/quantile_tests.cpp       | 25 +++++++++++-----------
 cpp/tests/groupby/replace_nulls_tests.cpp  | 21 +++++++++++-------
 cpp/tests/groupby/std_tests.cpp            | 15 +++++++------
 cpp/tests/groupby/sum_of_squares_tests.cpp | 13 +++++------
 cpp/tests/groupby/sum_scan_tests.cpp       |  9 ++++----
 cpp/tests/groupby/sum_tests.cpp            |  9 ++++----
 cpp/tests/groupby/var_tests.cpp            | 15 +++++++------
 cpp/tests/replace/replace_nulls_tests.cpp  | 15 +++++++------
 25 files changed, 162 insertions(+), 147 deletions(-)

diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp
index 40dc07512eb..9645ee3d458 100644
--- a/cpp/tests/copying/get_value_tests.cpp
+++ b/cpp/tests/copying/get_value_tests.cpp
@@ -26,6 +26,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
@@ -560,8 +561,8 @@ struct ListGetStructValueTest : public BaseFixture {
     // {int: 1, string: NULL, list: NULL}
     return this->make_test_structs_column({{1}, {1}},
                                           strings_column_wrapper({"aa"}, {false}),
-                                          LCWinner_t({{}}, all_invalid()),
-                                          all_valid());
+                                          LCWinner_t({{}}, iterator_all_nulls()),
+                                          iterator_no_null());
   }
 
   /**
@@ -570,7 +571,7 @@ struct ListGetStructValueTest : public BaseFixture {
   SCW row1()
   {
     // NULL
-    return this->make_test_structs_column({-1}, {""}, LCWinner_t{-1}, all_invalid());
+    return this->make_test_structs_column({-1}, {""}, LCWinner_t{-1}, iterator_all_nulls());
   }
 
   /**
@@ -581,8 +582,8 @@ struct ListGetStructValueTest : public BaseFixture {
     // {int: 3, string: "xyz", list: [3, 8, 4]}
     return this->make_test_structs_column({{3}, {1}},
                                           strings_column_wrapper({"xyz"}, {true}),
-                                          LCWinner_t({{3, 8, 4}}, all_valid()),
-                                          all_valid());
+                                          LCWinner_t({{3, 8, 4}}, iterator_no_null()),
+                                          iterator_no_null());
   }
 
   /**
@@ -596,9 +597,6 @@ struct ListGetStructValueTest : public BaseFixture {
     // {int: 3, string: "xyz", list: [3, 8, 4]}
     return this->concat({row0(), row1(), row2()});
   }
-
-  auto all_valid() { return thrust::make_constant_iterator(true); }
-  auto all_invalid() { return thrust::make_constant_iterator(false); }
 };
 
 TYPED_TEST_CASE(ListGetStructValueTest, FixedWidthTypes);
diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp
index d43de574671..12d083b3651 100644
--- a/cpp/tests/groupby/argmax_tests.cpp
+++ b/cpp/tests/groupby/argmax_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -58,7 +59,7 @@ TYPED_TEST(groupby_argmax_test, zero_valid_keys)
 
   if (std::is_same<V, bool>::value) return;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals({3, 4, 5});
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -79,10 +80,10 @@ TYPED_TEST(groupby_argmax_test, zero_valid_values)
   if (std::is_same<V, bool>::value) return;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_argmax_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -104,7 +105,7 @@ TYPED_TEST(groupby_argmax_test, null_keys_and_values)
                                      {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0});
 
   //  {1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  {6, 3,     5, 4, 0,   2, 1,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 4, 7, 0}, {1, 1, 1, 0});
 
@@ -142,10 +143,10 @@ TEST_F(groupby_argmax_string_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::ARGMAX>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  strings_column_wrapper vals({"año", "bit", "₹1"}, all_null());
+  strings_column_wrapper vals({"año", "bit", "₹1"}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_argmax_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
diff --git a/cpp/tests/groupby/argmin_tests.cpp b/cpp/tests/groupby/argmin_tests.cpp
index 18ff0f8fef5..1c4e0bdd737 100644
--- a/cpp/tests/groupby/argmin_tests.cpp
+++ b/cpp/tests/groupby/argmin_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -58,7 +59,7 @@ TYPED_TEST(groupby_argmin_test, zero_valid_keys)
 
   if (std::is_same<V, bool>::value) return;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals({3, 4, 5});
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -79,10 +80,10 @@ TYPED_TEST(groupby_argmin_test, zero_valid_values)
   if (std::is_same<V, bool>::value) return;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_argmin_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -104,7 +105,7 @@ TYPED_TEST(groupby_argmin_test, null_keys_and_values)
                                      {1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0});
 
   //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 9, 6,     8, 5, 0,   7, 1,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 9, 8, 0}, {1, 1, 1, 0});
 
@@ -143,10 +144,10 @@ TEST_F(groupby_argmin_string_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::ARGMIN>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  strings_column_wrapper vals({"año", "bit", "₹1"}, all_null());
+  strings_column_wrapper vals({"año", "bit", "₹1"}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_argmin_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp
index b7b18982f51..4543b418474 100644
--- a/cpp/tests/groupby/count_scan_tests.cpp
+++ b/cpp/tests/groupby/count_scan_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -84,7 +85,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_keys)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys( {1, 2, 3}, all_null());
+  key_wrapper keys( {1, 2, 3}, iterator_all_nulls());
   value_wrapper vals{3, 4, 5};
 
   key_wrapper expect_keys{};
@@ -102,7 +103,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_values)
 
   // clang-format off
   key_wrapper keys   {1, 1, 1};
-  value_wrapper vals({3, 4, 5}, all_null());
+  value_wrapper vals({3, 4, 5}, iterator_all_nulls());
 
   key_wrapper expect_keys{1, 1, 1};
   result_wrapper expect_vals{0, 1, 2};
@@ -122,7 +123,7 @@ TYPED_TEST(groupby_count_scan_test, null_keys_and_values)
   value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //                        {1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4}
-  key_wrapper expect_keys(  {1, 1, 1, 2, 2, 2, 2, 3,    3, 4}, all_valid());
+  key_wrapper expect_keys(  {1, 1, 1, 2, 2, 2, 2, 3,    3, 4}, iterator_no_null());
   //                        {0, 3, 6, 1, 4, _, 9, 2, 7, 8, -}
   result_wrapper expect_vals{0, 1, 2, 0, 1,    2, 3, 0, 1, 0};
   // clang-format on
diff --git a/cpp/tests/groupby/count_tests.cpp b/cpp/tests/groupby/count_tests.cpp
index f3d73a0fe30..b5bf8875726 100644
--- a/cpp/tests/groupby/count_tests.cpp
+++ b/cpp/tests/groupby/count_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -75,7 +76,7 @@ TYPED_TEST(groupby_count_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -97,7 +98,7 @@ TYPED_TEST(groupby_count_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals{0};
@@ -125,7 +126,7 @@ TYPED_TEST(groupby_count_test, null_keys_and_values)
 
   // clang-format off
   //                                        {1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1,        2,         3,       4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1,        2,         3,       4}, iterator_no_null());
   //                                        {3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({2,        3,         2,       0});
   // clang-format on
diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp
index c7e27cd6367..12d46cebb1c 100644
--- a/cpp/tests/groupby/groupby_test_util.hpp
+++ b/cpp/tests/groupby/groupby_test_util.hpp
@@ -125,17 +125,5 @@ inline void test_single_scan(column_view const& keys,
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, *result.second[0].results[0], true);
 }
 
-inline auto all_valid()
-{
-  auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
-  return all_valid;
-}
-
-inline auto all_null()
-{
-  auto all_null = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return false; });
-  return all_null;
-}
-
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/groupby/groups_tests.cpp b/cpp/tests/groupby/groups_tests.cpp
index 9281e4081c2..7111a2ee22b 100644
--- a/cpp/tests/groupby/groups_tests.cpp
+++ b/cpp/tests/groupby/groups_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/types.hpp>
@@ -57,7 +58,7 @@ TEST_F(groupby_group_keys_test, all_null_keys)
 {
   using K = int32_t;
 
-  fixed_width_column_wrapper<K> keys({1, 1, 2, 3, 1, 2}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 1, 2, 3, 1, 2}, iterator_all_nulls());
   fixed_width_column_wrapper<K> expect_grouped_keys{};
   std::vector<size_type> expect_group_offsets = {0};
   test_groups(keys, expect_grouped_keys, expect_group_offsets);
@@ -82,7 +83,7 @@ TYPED_TEST(groupby_group_keys_and_values_test, some_nulls)
   using V = TypeParam;
 
   fixed_width_column_wrapper<K> keys({1, 1, 3, 2, 1, 2}, {1, 0, 1, 0, 0, 1});
-  fixed_width_column_wrapper<K> expect_grouped_keys({1, 2, 3}, all_valid());
+  fixed_width_column_wrapper<K> expect_grouped_keys({1, 2, 3}, iterator_no_null());
   fixed_width_column_wrapper<V> values({1, 2, 3, 4, 5, 6});
   fixed_width_column_wrapper<V> expect_grouped_values({1, 6, 3});
   std::vector<size_type> expect_group_offsets = {0, 1, 2, 3};
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 78299e1a18c..2b744ad8334 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -58,7 +59,7 @@ TYPED_TEST(groupby_keys_test, zero_valid_keys)
   using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
 
   // clang-format off
-  fixed_width_column_wrapper<K> keys      ( { 1, 2, 3}, all_null() );
+  fixed_width_column_wrapper<K> keys      ( { 1, 2, 3}, iterator_all_nulls() );
   fixed_width_column_wrapper<V> vals        { 3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys { };
@@ -81,7 +82,7 @@ TYPED_TEST(groupby_keys_test, some_null_keys)
   fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
 
                                         //  { 1, 1, 1,  2, 2, 2, 2,  3, 3,  4}
-  fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3,     4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3,     4}, iterator_no_null() );
                                         //  { 0, 3, 6,  1, 4, 5, 9,  2, 8,  -}
   fixed_width_column_wrapper<R> expect_vals { 3,        4,           2,     1};
   // clang-format on
@@ -180,7 +181,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_nullable)
                                             { 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1});
   fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
 
-  fixed_width_column_wrapper<K> expect_keys({ 1,       2,          3,       4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({ 1,       2,          3,       4}, iterator_no_null() );
   fixed_width_column_wrapper<R> expect_vals { 3,       15,         17,      4};
   // clang-format on
 
diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp
index c1fc48ca698..d473a5c53b9 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -77,7 +78,7 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_keys)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys(  {1, 2, 3}, all_null());
+  key_wrapper keys(  {1, 2, 3}, iterator_all_nulls());
   value_wrapper vals({3, 4, 5});
 
   key_wrapper expect_keys{};
@@ -95,10 +96,10 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_values)
 
   // clang-format off
   key_wrapper keys   {1, 1, 1};
-  value_wrapper vals({3, 4, 5}, all_null());
+  value_wrapper vals({3, 4, 5}, iterator_all_nulls());
 
   key_wrapper expect_keys    {1, 1, 1};
-  result_wrapper expect_vals({-1, -1, -1}, all_null());
+  result_wrapper expect_vals({-1, -1, -1}, iterator_all_nulls());
   // clang-format on
 
   auto agg = cudf::make_max_aggregation();
@@ -115,7 +116,7 @@ TYPED_TEST(groupby_max_scan_test, null_keys_and_values)
   value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
                          //  {1, 1, 1, 2, 2, 2, 2, 3,   _, 3, 4}
-  key_wrapper expect_keys(   {1, 1, 1, 2, 2, 2, 2, 3,      3, 4}, all_valid());
+  key_wrapper expect_keys(   {1, 1, 1, 2, 2, 2, 2, 3,      3, 4}, iterator_no_null() );
                          //  { -, 3, 6, 1, 4,  -, 9, 2, _, 8, -}
   result_wrapper expect_vals({-1, 8, 8, 6, 9, -1, 9, 7,    7, -1},
                              { 0, 1, 1, 1, 1,  0, 1, 1,    1, 0});
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index 30720998fe0..5d9af9df777 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -73,7 +74,7 @@ TYPED_TEST(groupby_max_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::MAX>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals({3, 4, 5});
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -92,10 +93,10 @@ TYPED_TEST(groupby_max_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::MAX>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_max_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -115,7 +116,7 @@ TYPED_TEST(groupby_max_test, null_keys_and_values)
                                      {1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0});
 
   //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 0, 3,     1, 4, 5,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 5, 8, 0}, {1, 1, 1, 0});
 
@@ -147,10 +148,10 @@ TEST_F(groupby_max_string_test, basic)
 TEST_F(groupby_max_string_test, zero_valid_values)
 {
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  strings_column_wrapper vals({"año", "bit", "₹1"}, all_null());
+  strings_column_wrapper vals({"año", "bit", "₹1"}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  strings_column_wrapper expect_vals({""}, all_null());
+  strings_column_wrapper expect_vals({""}, iterator_all_nulls());
 
   auto agg = cudf::make_max_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp
index 026d999e172..805fb0b0400 100644
--- a/cpp/tests/groupby/mean_tests.cpp
+++ b/cpp/tests/groupby/mean_tests.cpp
@@ -19,6 +19,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
@@ -88,7 +89,7 @@ TYPED_TEST(groupby_mean_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::MEAN>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -104,10 +105,10 @@ TYPED_TEST(groupby_mean_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::MEAN>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_mean_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -126,7 +127,7 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values)
 
   // clang-format off
   //                                        {1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1,        2,         3,       4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1,        2,         3,       4}, iterator_no_null());
   //                                        {3, 6,     1, 4, 9,   2, 8,    -}
   std::vector<RT> expect_v = convert<RT>(   {4.5,      14. / 3,   5.,      0.});
   fixed_width_column_wrapper<R, RT> expect_vals(expect_v.cbegin(), expect_v.cend(), {1, 1, 1, 0});
diff --git a/cpp/tests/groupby/median_tests.cpp b/cpp/tests/groupby/median_tests.cpp
index d83e9ec946b..c87121df555 100644
--- a/cpp/tests/groupby/median_tests.cpp
+++ b/cpp/tests/groupby/median_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -45,7 +46,7 @@ TYPED_TEST(groupby_median_test, basic)
   //                                       {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys{1,       2,          3};
   //                                        {0, 3, 6, 1, 4, 5, 9, 2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({3.,     4.5,        7.}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({3.,     4.5,        7.}, iterator_no_null());
   // clang-format on
 
   auto agg = cudf::make_median_aggregation();
@@ -72,7 +73,7 @@ TYPED_TEST(groupby_median_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::MEDIAN>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -88,10 +89,10 @@ TYPED_TEST(groupby_median_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::MEDIAN>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_median_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -108,7 +109,7 @@ TYPED_TEST(groupby_median_test, null_keys_and_values)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0});
 
@@ -128,7 +129,7 @@ TYPED_TEST(groupby_median_test, dictionary)
   //                                        {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys({1,       2,          3      });
   //                                        {0, 3, 6, 1, 4, 5, 9, 2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({3.,       4.5,       7.     }, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({3.,       4.5,       7.     }, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_median_aggregation());
diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp
index d3186d880cc..dc63f7995f1 100644
--- a/cpp/tests/groupby/min_scan_tests.cpp
+++ b/cpp/tests/groupby/min_scan_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -75,7 +76,7 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_keys)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys({1, 2, 3}, all_null());
+  key_wrapper keys({1, 2, 3}, iterator_all_nulls());
   value_wrapper vals({3, 4, 5});
 
   key_wrapper expect_keys{};
@@ -93,10 +94,10 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_values)
 
   // clang-format off
   key_wrapper keys   {1, 1, 1};
-  value_wrapper vals({3, 4, 5}, all_null());
+  value_wrapper vals({3, 4, 5}, iterator_all_nulls());
 
   key_wrapper expect_keys    {1, 1, 1};
-  result_wrapper expect_vals({-1, -1, -1}, all_null());
+  result_wrapper expect_vals({-1, -1, -1}, iterator_all_nulls());
   // clang-format on
 
   auto agg = cudf::make_min_aggregation();
@@ -113,7 +114,7 @@ TYPED_TEST(groupby_min_scan_test, null_keys_and_values)
   value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
                          //  { 1, 1, 1, 2, 2,  2, 2, 3, _, 3, 4}
-  key_wrapper expect_keys(   { 1, 1, 1, 2, 2,  2, 2, 3,    3, 4}, all_valid());
+  key_wrapper expect_keys(   { 1, 1, 1, 2, 2,  2, 2, 3,    3, 4}, iterator_no_null());
                          //  { _, 8, 1, 6, 9,  _, 4, 7, 2, 3, _}
   result_wrapper expect_vals({-1, 8, 1, 6, 6, -1, 4, 7,    3, -1},
                              { 0, 1, 1, 1, 1,  0, 1, 1,    1, 0});
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index f6340a4838b..c5552d96853 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -73,7 +74,7 @@ TYPED_TEST(groupby_min_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::MIN>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals({3, 4, 5});
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -92,10 +93,10 @@ TYPED_TEST(groupby_min_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::MIN>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_min_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -115,7 +116,7 @@ TYPED_TEST(groupby_min_test, null_keys_and_values)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 1, 2, 0}, {1, 1, 1, 0});
 
@@ -147,10 +148,10 @@ TEST_F(groupby_min_string_test, basic)
 TEST_F(groupby_min_string_test, zero_valid_values)
 {
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  strings_column_wrapper vals({"año", "bit", "₹1"}, all_null());
+  strings_column_wrapper vals({"año", "bit", "₹1"}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  strings_column_wrapper expect_vals({""}, all_null());
+  strings_column_wrapper expect_vals({""}, iterator_all_nulls());
 
   auto agg = cudf::make_min_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp
index 5630cba09da..a179159e592 100644
--- a/cpp/tests/groupby/nth_element_tests.cpp
+++ b/cpp/tests/groupby/nth_element_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -141,7 +142,7 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::NTH_ELEMENT>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V, int32_t> vals({3, 4, 5});
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -158,10 +159,10 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::NTH_ELEMENT>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V, int32_t> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V, int32_t> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R, int32_t> expect_vals({3}, all_null());
+  fixed_width_column_wrapper<R, int32_t> expect_vals({3}, iterator_all_nulls());
 
   auto agg = cudf::make_nth_element_aggregation(0);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -178,7 +179,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values)
   fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                               {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //keys                                    {1, 1, 1   2,2,2,2    3, 3,    4}
   //vals                                    {-,3,6,    1,4,-,9,  2,8,      -}
   fixed_width_column_wrapper<R, int32_t> expect_vals({-1, 1, 2, -1}, {0, 1, 1, 0});
@@ -198,7 +199,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values_out_of_bounds)
   fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                               {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
   //                                        {1, 1, 1    2, 2, 2,    3, 3,   4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //                                        {-,3,6,     1,4,-,9,    2,8,    -}
   //                                         value,     null,       out,    out
   fixed_width_column_wrapper<R, int32_t> expect_vals({6, -1, -1, -1}, {1, 0, 0, 0});
@@ -218,7 +219,7 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls)
   fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 4, 4, 2},
                                               {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0});
 
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //keys                                    {1, 1, 1    2, 2, 2, 2      3, 3, 3    4}
   //vals                                    {-, 3, 6    1, 4, -, 9, -   2, 2, 8,   4,-}
   //                                      0  null,      value,          value,     null
@@ -260,7 +261,7 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls_negative_index)
   fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 4, 4, 2},
                                               {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0});
 
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //keys                                    {1, 1, 1    2, 2, 2,        3, 3,       4}
   //vals                                    {-, 3, 6    1, 4, -, 9, -   2, 2, 8,    4,-}
   //                                      0  null,      value,          value,      value
diff --git a/cpp/tests/groupby/nunique_tests.cpp b/cpp/tests/groupby/nunique_tests.cpp
index acfa1c953e2..3ec1ad48511 100644
--- a/cpp/tests/groupby/nunique_tests.cpp
+++ b/cpp/tests/groupby/nunique_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -92,7 +93,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::NUNIQUE>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals({3, 4, 5});
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -108,7 +109,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::NUNIQUE>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals{0};
@@ -128,7 +129,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //                                        {1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   // all unique values only                 {3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals{2, 3, 2, 0};
   fixed_width_column_wrapper<R> expect_bool_vals{1, 1, 1, 0};
@@ -151,7 +152,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values_with_duplicates)
                                      {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0});
 
   //  { 1, 1,     2, 2, 2,    3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 3, 6,-    1, 4, 9,-   2*, 8,   -*}
   //  unique,     with null,  dup,     dup null
   fixed_width_column_wrapper<R> expect_vals{2, 3, 2, 0};
@@ -175,7 +176,7 @@ TYPED_TEST(groupby_nunique_test, include_nulls)
                                      {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0});
 
   //  { 1, 1,     2, 2, 2,    3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 3, 6,-    1, 4, 9,-   2*, 8,   -*}
   //  unique,     with null,  dup,     dup null
   fixed_width_column_wrapper<R> expect_vals{3, 4, 2, 1};
@@ -200,7 +201,7 @@ TYPED_TEST(groupby_nunique_test, dictionary)
                                      {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0});
 
   // { 1, 1,   2, 2, 2,   3, 3,   4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   // { 3, 6,-  1, 4, 9,-  2*, 8,  -*}
   //  unique,  with null, dup,    dup null
   fixed_width_column_wrapper<R> expect_fixed_vals({3, 4, 2, 1});
diff --git a/cpp/tests/groupby/product_tests.cpp b/cpp/tests/groupby/product_tests.cpp
index d2db409711d..27d7eb071bf 100644
--- a/cpp/tests/groupby/product_tests.cpp
+++ b/cpp/tests/groupby/product_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -45,7 +46,7 @@ TYPED_TEST(groupby_product_test, basic)
                                         //  { 1, 1, 1,  2, 2, 2, 2,  3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys { 1,        2,           3      };
                                         //  { 0, 3, 6,  1, 4, 5, 9,  2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({   0.,       180.,      112. }, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({   0.,       180.,      112. }, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
@@ -70,7 +71,7 @@ TYPED_TEST(groupby_product_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::PRODUCT>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -85,10 +86,10 @@ TYPED_TEST(groupby_product_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::PRODUCT>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
 }
@@ -105,7 +106,7 @@ TYPED_TEST(groupby_product_test, null_keys_and_values)
                                             { 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
                                         //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({ 1,        2,         3,       4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({ 1,        2,         3,       4}, iterator_no_null());
                                         //  { _, 3, 6,  1, 4, 9,   2, 8,    _}
   fixed_width_column_wrapper<R> expect_vals({ 18.,      36.,       16.,     3.},
                                             { 1,        1,         1,       0});
@@ -126,7 +127,7 @@ TYPED_TEST(groupby_product_test, dictionary)
                                         //  { 1, 1, 1,  2, 2, 2, 2,  3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3      });
                                         //  { 0, 3, 6,  1, 4, 5, 9,  2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({  0.,     180.,        112. }, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({  0.,     180.,        112. }, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
@@ -145,7 +146,7 @@ TYPED_TEST(groupby_product_test, dictionary_with_nulls)
                                         //  { 1, 1, 1,  2, 2, 2, 2,  3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3      });
                                         //  { 0, 3, 6,  @, 4, 5, 9,  @, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({  0.,     180.,        56. }, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({  0.,     180.,        56. }, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
diff --git a/cpp/tests/groupby/quantile_tests.cpp b/cpp/tests/groupby/quantile_tests.cpp
index babd84d4334..7509c1084dc 100644
--- a/cpp/tests/groupby/quantile_tests.cpp
+++ b/cpp/tests/groupby/quantile_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -45,7 +46,7 @@ TYPED_TEST(groupby_quantile_test, basic)
   //                                       {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   //                                       {0, 3, 6, 1, 4, 5, 9, 2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({3., 4.5, 7.}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({3., 4.5, 7.}, iterator_no_null());
   // clang-format on
 
   auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR);
@@ -72,7 +73,7 @@ TYPED_TEST(groupby_quantile_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::QUANTILE>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -88,10 +89,10 @@ TYPED_TEST(groupby_quantile_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::QUANTILE>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -108,7 +109,7 @@ TYPED_TEST(groupby_quantile_test, null_keys_and_values)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0});
 
@@ -128,7 +129,7 @@ TYPED_TEST(groupby_quantile_test, multiple_quantile)
   //                                       {1, 1, 1,   2, 2, 2, 2, 3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   //                                        {0, 3, 6,  1, 4, 5, 9, 2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({1.5, 4.5, 3.25, 6.,   4.5, 7.5}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({1.5, 4.5, 3.25, 6.,   4.5, 7.5}, iterator_no_null());
   // clang-format on
 
   auto agg = cudf::make_quantile_aggregation({0.25, 0.75}, interpolation::LINEAR);
@@ -148,27 +149,27 @@ TYPED_TEST(groupby_quantile_test, interpolation_types)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
-  fixed_width_column_wrapper<R> expect_vals1({2.4,      4.2,         4.}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals1({2.4,      4.2,         4.}, iterator_no_null());
   auto agg1 = cudf::make_quantile_aggregation({0.4}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
-  fixed_width_column_wrapper<R> expect_vals2({3,        4,           2}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals2({3,        4,           2}, iterator_no_null());
   auto agg2 = cudf::make_quantile_aggregation({0.4}, interpolation::NEAREST);
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
-  fixed_width_column_wrapper<R> expect_vals3({0,        4,          2}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals3({0,        4,          2}, iterator_no_null());
   auto agg3 = cudf::make_quantile_aggregation({0.4}, interpolation::LOWER);
   test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
-  fixed_width_column_wrapper<R> expect_vals4({3,        5,           7}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals4({3,        5,           7}, iterator_no_null());
   auto agg4 = cudf::make_quantile_aggregation({0.4}, interpolation::HIGHER);
   test_single_agg(keys, vals, expect_keys, expect_vals4, std::move(agg4));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
-  fixed_width_column_wrapper<R> expect_vals5({1.5,      4.5,         4.5}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals5({1.5,      4.5,         4.5}, iterator_no_null());
   auto agg5 = cudf::make_quantile_aggregation({0.4}, interpolation::MIDPOINT);
   test_single_agg(keys, vals, expect_keys, expect_vals5, std::move(agg5));
   // clang-format on
@@ -186,7 +187,7 @@ TYPED_TEST(groupby_quantile_test, dictionary)
   //                                        {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys({1, 2, 3});
   //                                        {0, 3, 6, 1, 4, 5, 9, 2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({3.,      4.5,        7.}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({3.,      4.5,        7.}, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys,
diff --git a/cpp/tests/groupby/replace_nulls_tests.cpp b/cpp/tests/groupby/replace_nulls_tests.cpp
index 527c7dba725..9ac3a11e286 100644
--- a/cpp/tests/groupby/replace_nulls_tests.cpp
+++ b/cpp/tests/groupby/replace_nulls_tests.cpp
@@ -17,14 +17,16 @@
 
 #include <tests/groupby/groupby_test_util.hpp>
 
-#include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
+
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
 namespace cudf {
 namespace test {
 
@@ -56,7 +58,7 @@ TYPED_TEST(GroupbyReplaceNullsFixedWidthTest, PrecedingFill)
   fixed_width_column_wrapper<TypeParam> val({42, 7, 24, 10, 1, 1000}, {1, 1, 1, 0, 0, 0});
 
   fixed_width_column_wrapper<K> expect_key{0, 0, 0, 1, 1, 1};
-  fixed_width_column_wrapper<TypeParam> expect_val({42, 24, 24, 7, 7, 7}, all_valid());
+  fixed_width_column_wrapper<TypeParam> expect_val({42, 24, 24, 7, 7, 7}, iterator_no_null());
 
   TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING);
 }
@@ -70,7 +72,8 @@ TYPED_TEST(GroupbyReplaceNullsFixedWidthTest, FollowingFill)
                                             {1, 0, 1, 0, 1, 0, 1, 1});
 
   fixed_width_column_wrapper<K> expect_key{0, 0, 0, 1, 1, 1, 1, 1};
-  fixed_width_column_wrapper<TypeParam> expect_val({2, 32, 32, 8, 128, 128, 128, 256}, all_valid());
+  fixed_width_column_wrapper<TypeParam> expect_val({2, 32, 32, 8, 128, 128, 128, 256},
+                                                   iterator_no_null());
 
   TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING);
 }
@@ -115,7 +118,8 @@ TEST_F(GroupbyReplaceNullsStringsTest, PrecedingFill)
                              {true, false, true, true, true, false, true});
 
   fixed_width_column_wrapper<K> expect_key{0, 0, 1, 1, 1, 1, 1};
-  strings_column_wrapper expect_val({"y", "42", "xx", "xx", "zzz", "zzz", "one"}, all_valid());
+  strings_column_wrapper expect_val({"y", "42", "xx", "xx", "zzz", "zzz", "one"},
+                                    iterator_no_null());
 
   TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING);
 }
@@ -129,7 +133,8 @@ TEST_F(GroupbyReplaceNullsStringsTest, FollowingFill)
                              {true, false, false, true, true, false, true});
 
   fixed_width_column_wrapper<K> expect_key{0, 0, 1, 1, 1, 1, 1};
-  strings_column_wrapper expect_val({"42", "42", "xx", "zzz", "zzz", "one", "one"}, all_valid());
+  strings_column_wrapper expect_val({"42", "42", "xx", "zzz", "zzz", "one", "one"},
+                                    iterator_no_null());
 
   TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING);
 }
diff --git a/cpp/tests/groupby/std_tests.cpp b/cpp/tests/groupby/std_tests.cpp
index f9980f3f5a6..4fd9afebc40 100644
--- a/cpp/tests/groupby/std_tests.cpp
+++ b/cpp/tests/groupby/std_tests.cpp
@@ -20,6 +20,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -47,7 +48,7 @@ TYPED_TEST(groupby_std_test, basic)
   //                                        {1, 1, 1,  2, 2, 2, 2,    3, 3, 3}
   fixed_width_column_wrapper<K>  expect_keys{1,        2,             3};
   //                                        {0, 3, 6,  1, 4, 5, 9,    2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({3.,       sqrt(131./12), sqrt(31./3)}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({3.,       sqrt(131./12), sqrt(31./3)}, iterator_no_null());
   // clang-format on
 
   auto agg = cudf::make_std_aggregation();
@@ -74,7 +75,7 @@ TYPED_TEST(groupby_std_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::STD>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -90,10 +91,10 @@ TYPED_TEST(groupby_std_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::STD>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_std_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -110,7 +111,7 @@ TYPED_TEST(groupby_std_test, null_keys_and_values)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
 
   //                                        { 1, 1,     2, 2, 2,   3, 3,       4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //                                        { 3, 6,     1, 4, 9,   2, 8,       3}
   fixed_width_column_wrapper<R> expect_vals({3 / sqrt(2), 7 / sqrt(3), 3 * sqrt(2), 0.},
                                             {1, 1, 1, 0});
@@ -130,7 +131,7 @@ TYPED_TEST(groupby_std_test, ddof_non_default)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
 
   //                                        { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //                                        { 3, 6,     1, 4, 9,   2, 8,    3}
   fixed_width_column_wrapper<R> expect_vals({0., 7 * sqrt(2. / 3), 0., 0.}, {0, 1, 0, 0});
 
@@ -150,7 +151,7 @@ TYPED_TEST(groupby_std_test, dictionary)
   //                                        {1, 1, 1,  2, 2, 2, 2,    3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys({1,        2,             3});
   //                                        {0, 3, 6,  1, 4, 5, 9,    2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({3.,       sqrt(131./12), sqrt(31./3)}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({3.,       sqrt(131./12), sqrt(31./3)}, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_std_aggregation());
diff --git a/cpp/tests/groupby/sum_of_squares_tests.cpp b/cpp/tests/groupby/sum_of_squares_tests.cpp
index 24306a51056..84843f860e2 100644
--- a/cpp/tests/groupby/sum_of_squares_tests.cpp
+++ b/cpp/tests/groupby/sum_of_squares_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -44,7 +45,7 @@ TYPED_TEST(groupby_sum_of_squares_test, basic)
   //  { 1, 1, 1,  2, 2, 2, 2,  3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   //  { 0, 3, 6,  1, 4, 5, 9,  2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({45., 123., 117.}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({45., 123., 117.}, iterator_no_null());
 
   auto agg = cudf::make_sum_of_squares_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -70,7 +71,7 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::SUM_OF_SQUARES>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -86,10 +87,10 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::SUM_OF_SQUARES>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_sum_of_squares_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -106,7 +107,7 @@ TYPED_TEST(groupby_sum_of_squares_test, null_keys_and_values)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 3, 6,     1, 4, 9,   2, 8,    3}
   fixed_width_column_wrapper<R> expect_vals({45., 98., 68., 9.}, {1, 1, 1, 0});
 
@@ -126,7 +127,7 @@ TYPED_TEST(groupby_sum_of_squares_test, dictionary)
   //                                        {1, 1, 1,  2, 2, 2, 2,  3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys({1,        2,           3      });
   //                                        {0, 3, 6,  1, 4, 5, 9,  2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({45.,       123.,       117.   }, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({45.,       123.,       117.   }, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_of_squares_aggregation());
diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index 9f6c21462b3..52df342bbd1 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -81,7 +82,7 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_keys)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys({1, 2, 3}, all_null());
+  key_wrapper keys({1, 2, 3}, iterator_all_nulls());
   value_wrapper vals{3, 4, 5};
 
   key_wrapper expect_keys{};
@@ -99,10 +100,10 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_values)
 
   // clang-format off
   key_wrapper keys   {1, 1, 1};
-  value_wrapper vals({3, 4, 5}, all_null());
+  value_wrapper vals({3, 4, 5}, iterator_all_nulls());
 
   key_wrapper expect_keys    {1, 1, 1};
-  result_wrapper expect_vals({3, 4, 5}, all_null());
+  result_wrapper expect_vals({3, 4, 5}, iterator_all_nulls());
   // clang-format on
 
   auto agg = cudf::make_sum_aggregation();
@@ -119,7 +120,7 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values)
   value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //                         { 1, 1, 1, 2, 2,  2,  2, 3, *, 3, 4};
-  key_wrapper expect_keys(   { 1, 1, 1, 2, 2,  2,  2, 3,    3, 4}, all_valid());
+  key_wrapper expect_keys(   { 1, 1, 1, 2, 2,  2,  2, 3,    3, 4}, iterator_no_null());
                           // { -, 3, 6, 1, 4,  -,  9, 2, _, 8, -}
   result_wrapper expect_vals({-1, 3, 9, 1, 5, -1, 14, 2,   10, -1},
                              { 0, 1, 1, 1, 1,  0,  1, 1,    1, 0});
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 90544dd0db6..88d8ded324a 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -76,7 +77,7 @@ TYPED_TEST(groupby_sum_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::SUM>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -95,10 +96,10 @@ TYPED_TEST(groupby_sum_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::SUM>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -118,7 +119,7 @@ TYPED_TEST(groupby_sum_test, null_keys_and_values)
                                      {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4}, iterator_no_null());
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({9, 14, 10, 0}, {1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/var_tests.cpp b/cpp/tests/groupby/var_tests.cpp
index 5835d850b8c..8b2e13ad0f5 100644
--- a/cpp/tests/groupby/var_tests.cpp
+++ b/cpp/tests/groupby/var_tests.cpp
@@ -20,6 +20,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -47,7 +48,7 @@ TYPED_TEST(groupby_var_test, basic)
   //                                       {1, 1, 1,  2, 2, 2, 2,  3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys{1,        2,           3};
   //                                       {0, 3, 6,  1, 4, 5, 9,  2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({9.,      131. / 12,   31. / 3}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({9.,      131. / 12,   31. / 3}, iterator_no_null());
   // clang-format on
 
   auto agg = cudf::make_variance_aggregation();
@@ -74,7 +75,7 @@ TYPED_TEST(groupby_var_test, zero_valid_keys)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, aggregation::VARIANCE>;
 
-  fixed_width_column_wrapper<K> keys({1, 2, 3}, all_null());
+  fixed_width_column_wrapper<K> keys({1, 2, 3}, iterator_all_nulls());
   fixed_width_column_wrapper<V> vals{3, 4, 5};
 
   fixed_width_column_wrapper<K> expect_keys{};
@@ -90,10 +91,10 @@ TYPED_TEST(groupby_var_test, zero_valid_values)
   using R = cudf::detail::target_type_t<V, aggregation::VARIANCE>;
 
   fixed_width_column_wrapper<K> keys{1, 1, 1};
-  fixed_width_column_wrapper<V> vals({3, 4, 5}, all_null());
+  fixed_width_column_wrapper<V> vals({3, 4, 5}, iterator_all_nulls());
 
   fixed_width_column_wrapper<K> expect_keys{1};
-  fixed_width_column_wrapper<R> expect_vals({0}, all_null());
+  fixed_width_column_wrapper<R> expect_vals({0}, iterator_all_nulls());
 
   auto agg = cudf::make_variance_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
@@ -111,7 +112,7 @@ TYPED_TEST(groupby_var_test, null_keys_and_values)
 
   // clang-format off
   //                                        {1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1,        2,         3,       4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1,        2,         3,       4}, iterator_no_null());
   //                                        {3, 6,     1, 4, 9,   2, 8,    3}
   fixed_width_column_wrapper<R> expect_vals({4.5,      49. / 3,   18.,     0.}, {1, 1, 1, 0});
   // clang-format on
@@ -132,7 +133,7 @@ TYPED_TEST(groupby_var_test, ddof_non_default)
 
   // clang-format off
   //                                        { 1, 1,     2, 2, 2,   3, 3,    4}
-  fixed_width_column_wrapper<K> expect_keys({1,         2,         3,       4}, all_valid());
+  fixed_width_column_wrapper<K> expect_keys({1,         2,         3,       4}, iterator_no_null());
   //                                        { 3, 6,     1, 4, 9,   2, 8,    3}
   fixed_width_column_wrapper<R> expect_vals({0.,        98. / 3,   0.,      0.},
                                             {0,         1,         0,       0});
@@ -154,7 +155,7 @@ TYPED_TEST(groupby_var_test, dictionary)
   //                                        {1, 1, 1,  2, 2, 2, 2,  3, 3, 3}
   fixed_width_column_wrapper<K> expect_keys({1,        2,           3      });
   //                                        {0, 3, 6,  1, 4, 5, 9,  2, 7, 8}
-  fixed_width_column_wrapper<R> expect_vals({9.,      131./12,      31./3  }, all_valid());
+  fixed_width_column_wrapper<R> expect_vals({9.,      131./12,      31./3  }, iterator_no_null());
   // clang-format on
 
   test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_variance_aggregation());
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index cd19b0a70f3..3f0f546f55e 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -31,6 +31,7 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 struct ReplaceErrorTest : public cudf::test::BaseFixture {
@@ -191,7 +192,7 @@ TEST_F(ReplaceNullsPolicyStringTest, PrecedingFill)
                                            {1, 0, 0, 1, 1, 0, 1});
 
   cudf::test::strings_column_wrapper expected({"head", "head", "head", "mid", "mid", "mid", "tail"},
-                                              cudf::test::all_valid());
+                                              cudf::test::iterator_no_null());
 
   auto result = cudf::replace_nulls(input, cudf::replace_policy::PRECEDING);
 
@@ -204,7 +205,7 @@ TEST_F(ReplaceNullsPolicyStringTest, FollowingFill)
                                            {1, 0, 0, 1, 1, 0, 1});
 
   cudf::test::strings_column_wrapper expected({"head", "mid", "mid", "mid", "mid", "tail", "tail"},
-                                              cudf::test::all_valid());
+                                              cudf::test::iterator_no_null());
 
   auto result = cudf::replace_nulls(input, cudf::replace_policy::FOLLOWING);
 
@@ -388,7 +389,7 @@ TYPED_TEST(ReplaceNullsPolicyTest, PrecedingFill)
   TestReplaceNullsWithPolicy(
     cudf::test::fixed_width_column_wrapper<TypeParam>(col.begin(), col.end(), mask.begin()),
     cudf::test::fixed_width_column_wrapper<TypeParam>(
-      expect_col.begin(), expect_col.end(), cudf::test::all_valid()),
+      expect_col.begin(), expect_col.end(), cudf::test::iterator_no_null()),
     cudf::replace_policy::PRECEDING);
 }
 
@@ -402,7 +403,7 @@ TYPED_TEST(ReplaceNullsPolicyTest, FollowingFill)
   TestReplaceNullsWithPolicy(
     cudf::test::fixed_width_column_wrapper<TypeParam>(col.begin(), col.end(), mask.begin()),
     cudf::test::fixed_width_column_wrapper<TypeParam>(
-      expect_col.begin(), expect_col.end(), cudf::test::all_valid()),
+      expect_col.begin(), expect_col.end(), cudf::test::iterator_no_null()),
     cudf::replace_policy::FOLLOWING);
 }
 
@@ -659,7 +660,8 @@ TEST_F(ReplaceNullsPolicyDictionaryTest, PrecedingFill)
   auto input = cudf::dictionary::encode(input_w);
 
   cudf::test::strings_column_wrapper expected_w(
-    {"head", "head", "head", "mid1", "mid2", "tail", "tail", "tail"}, cudf::test::all_valid());
+    {"head", "head", "head", "mid1", "mid2", "tail", "tail", "tail"},
+    cudf::test::iterator_no_null());
   auto expected = cudf::dictionary::encode(expected_w);
 
   auto result = cudf::replace_nulls(*input, cudf::replace_policy::PRECEDING);
@@ -674,7 +676,8 @@ TEST_F(ReplaceNullsPolicyDictionaryTest, FollowingFill)
   auto input = cudf::dictionary::encode(input_w);
 
   cudf::test::strings_column_wrapper expected_w(
-    {"head", "mid1", "mid1", "mid1", "mid2", "tail", "tail", "tail"}, cudf::test::all_valid());
+    {"head", "mid1", "mid1", "mid1", "mid2", "tail", "tail", "tail"},
+    cudf::test::iterator_no_null());
   auto expected = cudf::dictionary::encode(expected_w);
 
   auto result = cudf::replace_nulls(*input, cudf::replace_policy::FOLLOWING);

From ae8ee8a060f7956cd55cbf060cab4de400efcee5 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 7 Jun 2021 15:26:24 -0700
Subject: [PATCH 11/15] Refactor `scatter` for list columns (#8255)

This PR refactors `scatter` for `LIST` type columns. Previously there were nested `for_each_n` calls when constructing child columns: the outer loop iterated over the rows and the inner loop over the elements of each row. We can replace these loops with a single `transform` because we already have the offsets of the column being constructed.

For each element, we first look up the `unbound_list_view` it belongs to by binary searching the offsets vector. The corresponding element to copy from can then be retrieved by dereferencing the bound `list_view` at the proper intra-list index.

The struct type refactor is different. Currently the implementation wraps every child in a lists column and dispatches to the list type specialization. This is fine, but the wrapping process currently deep copies the list offsets and child column just for dispatching. We can simplify it by wrapping them in a view instead.

Since `scatter.cuh` is included in many other files, separating out the scatter implementation details helps reduce compilation time when refactoring the code. Most helper functions are moved into `scatter_helper.cu`.

Benchmarking code for scattering lists is added. Benchmark snapshot is below:
```
Benchmark                                                                      Time             CPU      Time Old      Time New       CPU Old       CPU New
-----------------------------------------------------------------------------------------------------------------------------------------------------------
ScatterLists/double_type_colesce_o/1024/64/manual_time                      -0.1073         -0.0926        110648         98781        129731        117724
ScatterLists/double_type_colesce_o/4096/64/manual_time                      -0.1177         -0.1015        113393        100045        132412        118971
ScatterLists/double_type_colesce_o/32768/64/manual_time                     -0.3785         -0.3391        167288        103962        185599        122663
ScatterLists/double_type_colesce_o/262144/64/manual_time                    -0.3175         -0.2834        171123        116785        188191        134865
ScatterLists/double_type_colesce_o/2097152/64/manual_time                   -0.2581         -0.2426        270225        200472        290363        219934
ScatterLists/double_type_colesce_o/16777216/64/manual_time                  -0.8464         -0.8438       6205089        953139       6224867        972548
ScatterLists/double_type_colesce_o/33554432/64/manual_time                  -0.8437         -0.8423      12087712       1889483      12107066       1909170
ScatterLists/double_type_colesce_o/1024/512/manual_time                     -0.3487         -0.3111        150169         97810        169463        116736
ScatterLists/double_type_colesce_o/4096/512/manual_time                     -0.3499         -0.3116        151978         98794        170918        117661
ScatterLists/double_type_colesce_o/32768/512/manual_time                    -0.4337         -0.3901        196663        111364        215048        131162
ScatterLists/double_type_colesce_o/262144/512/manual_time                   -0.8083         -0.7844        590691        113251        607891        131089
ScatterLists/double_type_colesce_o/2097152/512/manual_time                  -0.7018         -0.6815        641149        191192        661107        210559
ScatterLists/double_type_colesce_o/16777216/512/manual_time                 -0.6893         -0.6842       2581320        802057       2601542        821602
ScatterLists/double_type_colesce_o/33554432/512/manual_time                 -0.8277         -0.8259       9150244       1576769       9169846       1596137
ScatterLists/double_type_colesce_o/1024/2048/manual_time                    -0.6584         -0.6178        284006         97008        303179        115869
ScatterLists/double_type_colesce_o/4096/2048/manual_time                    -0.6648         -0.6250        289209         96934        308413        115647
ScatterLists/double_type_colesce_o/32768/2048/manual_time                   -0.7433         -0.7089        386115         99120        404566        117774
ScatterLists/double_type_colesce_o/262144/2048/manual_time                  -0.8214         -0.7984        611876        109305        629110        126803
ScatterLists/double_type_colesce_o/2097152/2048/manual_time                 -0.9107         -0.9024       2098263        187417       2118254        206798
ScatterLists/double_type_colesce_o/16777216/2048/manual_time                -0.6869         -0.6816       2527109        791306       2546819        810805
ScatterLists/double_type_colesce_o/33554432/2048/manual_time                -0.5102         -0.5070       3018595       1478458       3038315       1497923
```

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - David Wendt (https://github.com/davidwendt)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/8255
---
 conda/recipes/libcudf/meta.yaml               |   1 +
 cpp/CMakeLists.txt                            |   1 +
 cpp/benchmarks/CMakeLists.txt                 |   4 +
 .../lists/copying/scatter_lists_benchmark.cu  | 131 ++++
 cpp/include/cudf/lists/detail/scatter.cuh     | 627 +-----------------
 .../cudf/lists/detail/scatter_helper.cuh      | 148 +++++
 cpp/src/lists/copying/scatter_helper.cu       | 511 ++++++++++++++
 7 files changed, 805 insertions(+), 618 deletions(-)
 create mode 100644 cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu
 create mode 100644 cpp/include/cudf/lists/detail/scatter_helper.cuh
 create mode 100644 cpp/src/lists/copying/scatter_helper.cu

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index dc41c439d27..139ceb1d6af 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -141,6 +141,7 @@ test:
     - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp
     - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp
     - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
+    - test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh
     - test -f $PREFIX/include/cudf/lists/combine.hpp
     - test -f $PREFIX/include/cudf/lists/count_elements.hpp
     - test -f $PREFIX/include/cudf/lists/explode.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 015c856d272..4b2e81edb9d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -274,6 +274,7 @@ add_library(cudf
     src/lists/copying/copying.cu
     src/lists/copying/gather.cu
     src/lists/copying/segmented_gather.cu
+    src/lists/copying/scatter_helper.cu
     src/lists/count_elements.cu
     src/lists/drop_list_duplicates.cu
     src/lists/explode.cu
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 41b6ddcc2df..e8ccb24f44c 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -67,6 +67,10 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu)
 # - scatter benchmark -----------------------------------------------------------------------------
 ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu)
 
+###################################################################################################
+# - lists scatter benchmark -----------------------------------------------------------------------
+ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu)
+
 ###################################################################################################
 # - contiguous_split benchmark  -------------------------------------------------------------------
 ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu)
diff --git a/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu b/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu
new file mode 100644
index 00000000000..49007fda7a3
--- /dev/null
+++ b/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/null_mask.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+
+#include <cmath>
+
+namespace cudf {
+
+class ScatterLists : public cudf::benchmark {  // fixture named by BENCHMARK_DEFINE_F below
+};
+
+template <class TypeParam, bool coalesce>
+void BM_lists_scatter(::benchmark::State& state)
+{
+  auto stream = rmm::cuda_stream_default;
+  auto mr     = rmm::mr::get_current_device_resource();
+  // Times scatter() of one LIST column: range(0) = total elements, range(1) = elements per row.
+  const size_type base_size{(size_type)state.range(0)};
+  const size_type num_elements_per_row{(size_type)state.range(1)};
+  const size_type num_rows = (size_type)ceil(double(base_size) / num_elements_per_row);
+  // Child (element) columns of the source and target lists, each filled with 0, 1, 2, ...
+  auto source_base_col = make_fixed_width_column(
+    data_type{type_to_id<TypeParam>()}, base_size, mask_state::UNALLOCATED, stream, mr);
+  auto target_base_col = make_fixed_width_column(
+    data_type{type_to_id<TypeParam>()}, base_size, mask_state::UNALLOCATED, stream, mr);
+  thrust::sequence(rmm::exec_policy(stream),
+                   source_base_col->mutable_view().begin<TypeParam>(),
+                   source_base_col->mutable_view().end<TypeParam>());
+  thrust::sequence(rmm::exec_policy(stream),
+                   target_base_col->mutable_view().begin<TypeParam>(),
+                   target_base_col->mutable_view().end<TypeParam>());
+  // Offsets 0, k, 2k, ... with k = num_elements_per_row, so every list row holds k elements.
+  auto source_offsets = make_fixed_width_column(
+    data_type{type_to_id<offset_type>()}, num_rows + 1, mask_state::UNALLOCATED, stream, mr);
+  auto target_offsets = make_fixed_width_column(
+    data_type{type_to_id<offset_type>()}, num_rows + 1, mask_state::UNALLOCATED, stream, mr);
+  // NOTE(review): last offset = num_rows * k exceeds the child size unless k divides base_size.
+  thrust::sequence(rmm::exec_policy(stream),
+                   source_offsets->mutable_view().begin<offset_type>(),
+                   source_offsets->mutable_view().end<offset_type>(),
+                   0,
+                   num_elements_per_row);
+  thrust::sequence(rmm::exec_policy(stream),
+                   target_offsets->mutable_view().begin<offset_type>(),
+                   target_offsets->mutable_view().end<offset_type>(),
+                   0,
+                   num_elements_per_row);
+  // Build the LIST columns: null count 0 and an UNALLOCATED null mask, i.e. no null rows.
+  auto source = make_lists_column(num_rows,
+                                  std::move(source_offsets),
+                                  std::move(source_base_col),
+                                  0,
+                                  cudf::create_null_mask(num_rows, mask_state::UNALLOCATED),
+                                  stream,
+                                  mr);
+  auto target = make_lists_column(num_rows,
+                                  std::move(target_offsets),
+                                  std::move(target_base_col),
+                                  0,
+                                  cudf::create_null_mask(num_rows, mask_state::UNALLOCATED),
+                                  stream,
+                                  mr);
+  // Scatter map: initially the reversed identity num_rows - 1, ..., 1, 0.
+  auto scatter_map = make_fixed_width_column(
+    data_type{type_to_id<size_type>()}, num_rows, mask_state::UNALLOCATED, stream, mr);
+  auto m_scatter_map = scatter_map->mutable_view();
+  thrust::sequence(rmm::exec_policy(stream),
+                   m_scatter_map.begin<size_type>(),
+                   m_scatter_map.end<size_type>(),
+                   num_rows - 1,
+                   -1);
+  // Non-coalesced variant: shuffle the whole map [begin, end) so destination rows are scattered.
+  if (not coalesce) {
+    thrust::default_random_engine g;
+    thrust::shuffle(rmm::exec_policy(stream),
+                    m_scatter_map.begin<size_type>(),
+                    m_scatter_map.end<size_type>(),
+                    g);
+  }
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
+    scatter(table_view{{*source}}, *scatter_map, table_view{{*target}}, false, mr);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) * 2 *
+                          sizeof(TypeParam));
+}
+
+#define SBM_BENCHMARK_DEFINE(name, type, coalesce)                                \
+  BENCHMARK_DEFINE_F(ScatterLists, name)(::benchmark::State & state)              \
+  {                                                                               \
+    BM_lists_scatter<type, coalesce>(state);                                      \
+  }                                                                               \
+  BENCHMARK_REGISTER_F(ScatterLists, name)                                        \
+    ->RangeMultiplier(8)                                                          \
+    ->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-32M elements, 64-2048/row */ \
+    ->UseManualTime();
+
+SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true);   // coalesce = true: no shuffle
+SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false);  // coalesce = false: shuffle branch
+
+}  // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index dac67545748..a440e456e25 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -19,131 +19,22 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/get_value.cuh>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/valid_if.cuh>
-#include <cudf/lists/detail/copying.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/lists/detail/scatter_helper.cuh>
 #include <cudf/lists/list_device_view.cuh>
 #include <cudf/null_mask.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/binary_search.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-
 #include <cinttypes>
 
 namespace cudf {
 namespace lists {
 namespace detail {
 
-namespace {
-
-/**
- * @brief Holder for a list row's positional information, without
- *        also holding a reference to the list column.
- *
- * Analogous to the list_view, this class is default constructable,
- * and can thus be stored in rmm::device_uvector. It is used to represent
- * the results of a `scatter()` operation; a device_uvector may hold
- * several instances of unbound_list_view, each with a flag indicating
- * whether it came from the scatter source or target. Each instance
- * may later be "bound" to the appropriate source/target column, to
- * reconstruct the list_view.
- */
-struct unbound_list_view {
-  /**
-   * @brief Flag type, indicating whether this list row originated from
-   *        the source or target column, in `scatter()`.
-   */
-  enum class label_type : bool { SOURCE, TARGET };
-
-  using lists_column_device_view = cudf::detail::lists_column_device_view;
-  using list_device_view         = cudf::list_device_view;
-
-  unbound_list_view()                         = default;
-  unbound_list_view(unbound_list_view const&) = default;
-  unbound_list_view(unbound_list_view&&)      = default;
-  unbound_list_view& operator=(unbound_list_view const&) = default;
-  unbound_list_view& operator=(unbound_list_view&&) = default;
-
-  /**
-   * @brief __device__ Constructor, for use from `scatter()`.
-   *
-   * @param scatter_source_label Whether the row came from source or target
-   * @param lists_column The actual source/target lists column
-   * @param row_index Index of the row in lists_column that this instance represents
-   */
-  CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label,
-                                         cudf::detail::lists_column_device_view const& lists_column,
-                                         size_type const& row_index)
-    : _label{scatter_source_label}, _row_index{row_index}
-  {
-    _size = list_device_view{lists_column, row_index}.size();
-  }
-
-  /**
-   * @brief __device__ Constructor, for use when constructing the child column
-   *        of a scattered list column
-   *
-   * @param scatter_source_label Whether the row came from source or target
-   * @param row_index Index of the row that this instance represents in the source/target column
-   * @param size The number of elements in this list row
-   */
-  CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label,
-                                         size_type const& row_index,
-                                         size_type const& size)
-    : _label{scatter_source_label}, _row_index{row_index}, _size{size}
-  {
-  }
-
-  /**
-   * @brief Returns number of elements in this list row.
-   */
-  CUDA_DEVICE_CALLABLE size_type size() const { return _size; }
-
-  /**
-   * @brief Returns whether this row came from the `scatter()` source or target
-   */
-  CUDA_DEVICE_CALLABLE label_type label() const { return _label; }
-
-  /**
-   * @brief Returns the index in the source/target column
-   */
-  CUDA_DEVICE_CALLABLE size_type row_index() const { return _row_index; }
-
-  /**
-   * @brief Binds to source/target column (depending on SOURCE/TARGET labels),
-   *        to produce a bound list_view.
-   *
-   * @param scatter_source Source column for the scatter operation
-   * @param scatter_target Target column for the scatter operation
-   * @return A (bound) list_view for the row that this object represents
-   */
-  CUDA_DEVICE_CALLABLE list_device_view
-  bind_to_column(lists_column_device_view const& scatter_source,
-                 lists_column_device_view const& scatter_target) const
-  {
-    return list_device_view(_label == label_type::SOURCE ? scatter_source : scatter_target,
-                            _row_index);
-  }
-
- private:
-  // Note: Cannot store reference to list column, because of storage in device_uvector.
-  // Only keep track of whether this list row came from the source or target of scatter.
-
-  label_type _label{
-    label_type::SOURCE};   // Whether this list row came from the scatter source or target.
-  size_type _row_index{};  // Row index in the Lists column.
-  size_type _size{};       // Number of elements in *this* list row.
-};
-
 template <typename IndexIterator>
 rmm::device_uvector<unbound_list_view> list_vector_from_column(
   unbound_list_view::label_type label,
@@ -168,503 +59,6 @@ rmm::device_uvector<unbound_list_view> list_vector_from_column(
   return vector;
 }
 
-/**
- * @brief Constructs null mask for a scattered list's child column
- *
- * @param parent_list_vector Vector of unbound_list_view, for parent lists column
- * @param parent_list_offsets List column offsets for parent lists column
- * @param source_lists Source lists column for scatter operation
- * @param target_lists Target lists column for scatter operation
- * @param num_child_rows Number of rows in child column
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate child column's null mask
- * @return std::pair<rmm::device_buffer, size_type> Child column's null mask and null row count
- */
-std::pair<rmm::device_buffer, size_type> construct_child_nullmask(
-  rmm::device_uvector<unbound_list_view> const& parent_list_vector,
-  column_view const& parent_list_offsets,
-  cudf::detail::lists_column_device_view const& source_lists,
-  cudf::detail::lists_column_device_view const& target_lists,
-  size_type num_child_rows,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  auto is_valid_predicate = [d_list_vector  = parent_list_vector.begin(),
-                             d_offsets      = parent_list_offsets.template data<size_type>(),
-                             d_offsets_size = parent_list_offsets.size(),
-                             source_lists,
-                             target_lists] __device__(auto const& i) {
-    auto list_start =
-      thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_offsets_size, i) - 1;
-    auto list_index    = list_start - d_offsets;
-    auto element_index = i - *list_start;
-
-    auto list_row = d_list_vector[list_index];
-    return !list_row.bind_to_column(source_lists, target_lists).is_null(element_index);
-  };
-
-  return cudf::detail::valid_if(thrust::make_counting_iterator<size_type>(0),
-                                thrust::make_counting_iterator<size_type>(num_child_rows),
-                                is_valid_predicate,
-                                stream,
-                                mr);
-}
-
-/**
- * @brief (type_dispatch endpoint) Functor that constructs the child column result
- *        of `scatter()`ing a list column.
- *
- * The protocol is as follows:
- *
- * Inputs:
- *  1. list_vector:  A device_uvector of unbound_list_view, with each element
- *                   indicating the position, size, and which column the list
- *                   row came from.
- *  2. list_offsets: The offsets column for the (outer) lists column, each offset
- *                   marking the beginning of a list row.
- *  3. source_list:  The lists-column that is the source of the scatter().
- *  4. target_list:  The lists-column that is the target of the scatter().
- *
- * Output: A (possibly non-list) child column, which may be used in combination
- *         with list_offsets to fully construct the outer list.
- *
- * Example:
- *
- * Consider the following scatter operation of two `list<int>` columns:
- *
- * 1. Source:      [{9,9,9,9}, {8,8,8}], i.e.
- *    a. Child:    [9,9,9,9,8,8,8]
- *    b. Offsets:  [0,      4,    7]
- *
- * 2. Target:      [{1,1}, {2,2}, {3,3}], i.e.
- *    a. Child:    [1,1,2,2,3,3]
- *    b. Offsets:  [0,  2,  4,  6]
- *
- * 3. Scatter-map: [2, 0]
- *
- * 4. Expected output: [{8,8,8}, {2,2}, {9,9,9,9}], i.e.
- *    a. Child:        [8,8,8,2,2,9,9,9,9]  <--- THIS
- *    b. Offsets:      [0,    3,  5,     9]
- *
- * `list_child_constructor` constructs the Expected Child column indicated above.
- *
- * `list_child_constructor` expects to be called with the `Source`/`Target`
- * lists columns, along with the following:
- *
- * 1. list_vector: [ S[1](3), T[1](2), S[0](4) ]
- *    Each unbound_list_view (e.g. S[1](3)) indicates:
- *      a. Which column the row is bound to: S == Source, T == Target
- *      b. The list index. E.g. S[1] indicates the 2nd list row of the Source column.
- *      c. The row size.   E.g. S[1](3) indicates that the row has 3 elements.
- *
- * 2. list_offsets: [0, 3, 5, 9]
- *    The caller may construct this with an `inclusive_scan()` on `list_vector`
- *    element sizes.
- */
-struct list_child_constructor {
- private:
-  /**
-   * @brief Determine whether the child column type is supported with scattering lists.
-   *
-   * @tparam T The data type of the child column of the list being scattered.
-   */
-  template <typename T>
-  struct is_supported_child_type {
-    static const bool value = cudf::is_fixed_width<T>() || std::is_same<T, string_view>::value ||
-                              std::is_same<T, list_view>::value ||
-                              std::is_same<T, struct_view>::value;
-  };
-
- public:
-  // SFINAE catch-all, for unsupported child column types.
-  template <typename T, typename... Args>
-  std::enable_if_t<!is_supported_child_type<T>::value, std::unique_ptr<column>> operator()(
-    Args&&... args)
-  {
-    CUDF_FAIL("list_child_constructor unsupported!");
-  }
-
-  /**
-   * @brief Implementation for fixed_width child column types.
-   */
-  template <typename T>
-  std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<column>> operator()(
-    rmm::device_uvector<unbound_list_view> const& list_vector,
-    cudf::column_view const& list_offsets,
-    cudf::lists_column_view const& source_lists_column_view,
-    cudf::lists_column_view const& target_lists_column_view,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
-  {
-    auto source_column_device_view =
-      column_device_view::create(source_lists_column_view.parent(), stream);
-    auto target_column_device_view =
-      column_device_view::create(target_lists_column_view.parent(), stream);
-    auto source_lists = cudf::detail::lists_column_device_view(*source_column_device_view);
-    auto target_lists = cudf::detail::lists_column_device_view(*target_column_device_view);
-
-    auto const num_child_rows{
-      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream)};
-
-    auto child_null_mask =
-      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
-        ? construct_child_nullmask(
-            list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr)
-        : std::make_pair(rmm::device_buffer{}, 0);
-
-    auto child_column = cudf::make_fixed_width_column(source_lists_column_view.child().type(),
-                                                      num_child_rows,
-                                                      std::move(child_null_mask.first),
-                                                      child_null_mask.second,
-                                                      stream,
-                                                      mr);
-
-    auto copy_child_values_for_list_index =
-      [d_scattered_lists = list_vector.begin(),  // unbound_list_view*
-       d_child_column    = child_column->mutable_view().data<T>(),
-       d_offsets         = list_offsets.template data<int32_t>(),
-       source_lists,
-       target_lists] __device__(auto const& row_index) {
-        auto const unbound_list_row = d_scattered_lists[row_index];
-        auto const actual_list_row  = unbound_list_row.bind_to_column(source_lists, target_lists);
-        auto const& bound_column =
-          (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? source_lists
-                                                                             : target_lists);
-        auto const list_begin_offset =
-          bound_column.offsets().template element<size_type>(unbound_list_row.row_index());
-        auto const list_end_offset =
-          bound_column.offsets().template element<size_type>(unbound_list_row.row_index() + 1);
-
-        // Copy all elements in this list row, to "appropriate" offset in child-column.
-        auto const destination_start_offset = d_offsets[row_index];
-        thrust::for_each_n(thrust::seq,
-                           thrust::make_counting_iterator<size_type>(0),
-                           actual_list_row.size(),
-                           [actual_list_row, d_child_column, destination_start_offset] __device__(
-                             auto const& list_element_index) {
-                             d_child_column[destination_start_offset + list_element_index] =
-                               actual_list_row.template element<T>(list_element_index);
-                           });
-      };
-
-    // For each list-row, copy underlying elements to the child column.
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       list_vector.size(),
-                       copy_child_values_for_list_index);
-
-    return std::make_unique<column>(child_column->view());
-  }
-
-  /**
-   * @brief Implementation for list child columns that contain strings.
-   */
-  template <typename T>
-  std::enable_if_t<std::is_same<T, string_view>::value, std::unique_ptr<column>> operator()(
-    rmm::device_uvector<unbound_list_view> const& list_vector,
-    cudf::column_view const& list_offsets,
-    cudf::lists_column_view const& source_lists_column_view,
-    cudf::lists_column_view const& target_lists_column_view,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
-  {
-    auto source_column_device_view =
-      column_device_view::create(source_lists_column_view.parent(), stream);
-    auto target_column_device_view =
-      column_device_view::create(target_lists_column_view.parent(), stream);
-    auto source_lists = cudf::detail::lists_column_device_view(*source_column_device_view);
-    auto target_lists = cudf::detail::lists_column_device_view(*target_column_device_view);
-
-    auto const num_child_rows{
-      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream)};
-
-    if (num_child_rows == 0) { return make_empty_column(data_type{type_id::STRING}); }
-
-    auto string_views = rmm::device_uvector<string_view>(num_child_rows, stream);
-
-    auto populate_string_views = [d_scattered_lists = list_vector.begin(),  // unbound_list_view*
-                                  d_list_offsets    = list_offsets.template data<int32_t>(),
-                                  d_string_views    = string_views.data(),
-                                  source_lists,
-                                  target_lists] __device__(auto const& row_index) {
-      auto unbound_list_view    = d_scattered_lists[row_index];
-      auto actual_list_row      = unbound_list_view.bind_to_column(source_lists, target_lists);
-      auto lists_column         = actual_list_row.get_column();
-      auto lists_offsets_column = lists_column.offsets();
-      auto child_strings_column = lists_column.child();
-      auto string_offsets_column =
-        child_strings_column.child(cudf::strings_column_view::offsets_column_index);
-      auto string_chars_column =
-        child_strings_column.child(cudf::strings_column_view::chars_column_index);
-
-      auto output_start_offset =
-        d_list_offsets[row_index];  // Offset in `string_views` at which string_views are
-                                    // to be written for this list row_index.
-      auto input_list_start =
-        lists_offsets_column.template element<int32_t>(unbound_list_view.row_index());
-
-      thrust::for_each_n(
-        thrust::seq,
-        thrust::make_counting_iterator<size_type>(0),
-        actual_list_row.size(),
-        [output_start_offset,
-         d_string_views,
-         input_list_start,
-         d_string_offsets = string_offsets_column.template data<int32_t>(),
-         d_string_chars =
-           string_chars_column.template data<char>()] __device__(auto const& string_idx) {
-          auto string_start_idx = d_string_offsets[input_list_start + string_idx];
-          auto string_end_idx   = d_string_offsets[input_list_start + string_idx + 1];
-
-          d_string_views[output_start_offset + string_idx] =
-            string_view{d_string_chars + string_start_idx, string_end_idx - string_start_idx};
-        });
-    };
-
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       list_vector.size(),
-                       populate_string_views);
-
-    // string_views should now have been populated with source and target references.
-
-    auto string_offsets = cudf::strings::detail::child_offsets_from_string_iterator(
-      string_views.begin(), string_views.size(), stream, mr);
-
-    auto string_chars = cudf::strings::detail::child_chars_from_string_vector(
-      string_views, string_offsets->view(), stream, mr);
-    auto child_null_mask =
-      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
-        ? construct_child_nullmask(
-            list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr)
-        : std::make_pair(rmm::device_buffer{}, 0);
-
-    return cudf::make_strings_column(num_child_rows,
-                                     std::move(string_offsets),
-                                     std::move(string_chars),
-                                     child_null_mask.second,            // Null count.
-                                     std::move(child_null_mask.first),  // Null mask.
-                                     stream,
-                                     mr);
-  }
-
-  /**
-   * @brief (Recursively) Constructs a child column that is itself a list column.
-   */
-  template <typename T>
-  std::enable_if_t<std::is_same<T, list_view>::value, std::unique_ptr<column>> operator()(
-    rmm::device_uvector<unbound_list_view> const& list_vector,
-    cudf::column_view const& list_offsets,
-    cudf::lists_column_view const& source_lists_column_view,
-    cudf::lists_column_view const& target_lists_column_view,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
-  {
-    auto source_column_device_view =
-      column_device_view::create(source_lists_column_view.parent(), stream);
-    auto target_column_device_view =
-      column_device_view::create(target_lists_column_view.parent(), stream);
-    auto source_lists = cudf::detail::lists_column_device_view(*source_column_device_view);
-    auto target_lists = cudf::detail::lists_column_device_view(*target_column_device_view);
-
-    auto const num_child_rows{
-      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream)};
-
-    if (num_child_rows == 0) {
-      // make an empty lists column using the input child type
-      return empty_like(source_lists_column_view.child());
-    }
-
-    auto child_list_views = rmm::device_uvector<unbound_list_view>(num_child_rows, stream, mr);
-
-    // Function to convert from parent list_device_view instances to child list_device_views.
-    // For instance, if a parent list_device_view has 3 elements, it should have 3 corresponding
-    // child list_device_view instances.
-    auto populate_child_list_views = [d_scattered_lists  = list_vector.begin(),
-                                      d_list_offsets     = list_offsets.template data<int32_t>(),
-                                      d_child_list_views = child_list_views.begin(),
-                                      source_lists,
-                                      target_lists] __device__(auto const& row_index) {
-      auto scattered_row        = d_scattered_lists[row_index];
-      auto label                = scattered_row.label();
-      auto bound_list_row       = scattered_row.bind_to_column(source_lists, target_lists);
-      auto lists_offsets_column = bound_list_row.get_column().offsets();
-
-      auto child_column  = bound_list_row.get_column().child();
-      auto child_offsets = child_column.child(cudf::lists_column_view::offsets_column_index);
-
-      // For lists row at row_index,
-      //   1. Number of entries in child_list_views == bound_list_row.size().
-      //   2. Offset of the first child list_view   == d_list_offsets[row_index].
-      auto output_start_offset = d_list_offsets[row_index];
-      auto input_list_start =
-        lists_offsets_column.template element<int32_t>(scattered_row.row_index());
-
-      thrust::for_each_n(
-        thrust::seq,
-        thrust::make_counting_iterator<size_type>(0),
-        bound_list_row.size(),
-        [input_list_start,
-         output_start_offset,
-         label,
-         d_child_list_views,
-         d_child_offsets =
-           child_offsets.template data<int32_t>()] __device__(auto const& child_list_index) {
-          auto child_start_idx = d_child_offsets[input_list_start + child_list_index];
-          auto child_end_idx   = d_child_offsets[input_list_start + child_list_index + 1];
-
-          d_child_list_views[output_start_offset + child_list_index] = unbound_list_view{
-            label, input_list_start + child_list_index, child_end_idx - child_start_idx};
-        });
-    };
-
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       list_vector.size(),
-                       populate_child_list_views);
-
-    // child_list_views should now have been populated, with source and target references.
-
-    auto begin = thrust::make_transform_iterator(
-      child_list_views.begin(), [] __device__(auto const& row) { return row.size(); });
-
-    auto child_offsets = cudf::strings::detail::make_offsets_child_column(
-      begin, begin + child_list_views.size(), stream, mr);
-
-    auto child_column = cudf::type_dispatcher<dispatch_storage_type>(
-      source_lists_column_view.child().child(1).type(),
-      list_child_constructor{},
-      child_list_views,
-      child_offsets->view(),
-      cudf::lists_column_view(source_lists_column_view.child()),
-      cudf::lists_column_view(target_lists_column_view.child()),
-      stream,
-      mr);
-
-    auto child_null_mask =
-      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
-        ? construct_child_nullmask(
-            list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr)
-        : std::make_pair(rmm::device_buffer{}, 0);
-
-    return cudf::make_lists_column(num_child_rows,
-                                   std::move(child_offsets),
-                                   std::move(child_column),
-                                   child_null_mask.second,            // Null count
-                                   std::move(child_null_mask.first),  // Null mask
-                                   stream,
-                                   mr);
-  }
-
-  /**
-   * @brief (Recursively) constructs child columns that are structs.
-   */
-  template <typename T>
-  std::enable_if_t<std::is_same<T, struct_view>::value, std::unique_ptr<column>> operator()(
-    rmm::device_uvector<unbound_list_view> const& list_vector,
-    cudf::column_view const& list_offsets,
-    cudf::lists_column_view const& source_lists_column_view,
-    cudf::lists_column_view const& target_lists_column_view,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
-  {
-    auto const source_column_device_view =
-      column_device_view::create(source_lists_column_view.parent(), stream);
-    auto const target_column_device_view =
-      column_device_view::create(target_lists_column_view.parent(), stream);
-    auto const source_lists = cudf::detail::lists_column_device_view(*source_column_device_view);
-    auto const target_lists = cudf::detail::lists_column_device_view(*target_column_device_view);
-
-    auto const source_structs = source_lists_column_view.child();
-    auto const target_structs = target_lists_column_view.child();
-
-    auto const num_child_rows{
-      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream)};
-
-    auto const num_struct_members =
-      std::distance(source_structs.child_begin(), source_structs.child_end());
-    std::vector<std::unique_ptr<column>> child_columns;
-    child_columns.reserve(num_struct_members);
-
-    auto project_member_as_list = [stream, mr](column_view const& structs_member,
-                                               cudf::size_type const& structs_list_num_rows,
-                                               column_view const& structs_list_offsets,
-                                               rmm::device_buffer const& structs_list_nullmask,
-                                               cudf::size_type const& structs_list_null_count) {
-      return cudf::make_lists_column(structs_list_num_rows,
-                                     std::make_unique<column>(structs_list_offsets, stream, mr),
-                                     std::make_unique<column>(structs_member, stream, mr),
-                                     structs_list_null_count,
-                                     rmm::device_buffer(structs_list_nullmask, stream),
-                                     stream,
-                                     mr);
-    };
-
-    auto const iter_source_member_as_list = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<cudf::size_type>(0), [&](auto child_idx) {
-        return project_member_as_list(
-          source_structs.child(child_idx),
-          source_lists_column_view.size(),
-          source_lists_column_view.offsets(),
-          cudf::detail::copy_bitmask(source_lists_column_view.parent(), stream, mr),
-          source_lists_column_view.null_count());
-      });
-
-    auto const iter_target_member_as_list = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<cudf::size_type>(0), [&](auto child_idx) {
-        return project_member_as_list(
-          target_structs.child(child_idx),
-          target_lists_column_view.size(),
-          target_lists_column_view.offsets(),
-          cudf::detail::copy_bitmask(target_lists_column_view.parent(), stream, mr),
-          target_lists_column_view.null_count());
-      });
-
-    std::transform(
-      iter_source_member_as_list,
-      iter_source_member_as_list + num_struct_members,
-      iter_target_member_as_list,
-      std::back_inserter(child_columns),
-      [&](auto source_struct_member_as_list, auto target_struct_member_as_list) {
-        return cudf::type_dispatcher<dispatch_storage_type>(
-          source_struct_member_as_list->child(cudf::lists_column_view::child_column_index).type(),
-          list_child_constructor{},
-          list_vector,
-          list_offsets,
-          cudf::lists_column_view(source_struct_member_as_list->view()),
-          cudf::lists_column_view(target_struct_member_as_list->view()),
-          stream,
-          mr);
-      });
-
-    auto child_null_mask =
-      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
-        ? construct_child_nullmask(
-            list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr)
-        : std::make_pair(rmm::device_buffer{}, 0);
-
-    return cudf::make_structs_column(num_child_rows,
-                                     std::move(child_columns),
-                                     child_null_mask.second,
-                                     std::move(child_null_mask.first),
-                                     stream,
-                                     mr);
-  }
-};
-
-/**
- * @brief Checks that the specified columns have matching schemas, all the way down.
- */
-void assert_same_data_type(column_view const& lhs, column_view const& rhs)
-{
-  CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Mismatched Data types.");
-  // Empty string column has no children
-  CUDF_EXPECTS(lhs.type().id() == type_id::STRING or lhs.num_children() == rhs.num_children(),
-               "Mismatched number of child columns.");
-
-  for (int i{0}; i < lhs.num_children(); ++i) { assert_same_data_type(lhs.child(i), rhs.child(i)); }
-}
-
 /**
  * @brief General implementation of scattering into list column
  *
@@ -716,14 +110,13 @@ std::unique_ptr<column> scatter_impl(
   auto offsets_column = cudf::strings::detail::make_offsets_child_column(
     list_size_begin, list_size_begin + target.size(), stream, mr);
 
-  auto child_column = cudf::type_dispatcher<dispatch_storage_type>(child_column_type,
-                                                                   list_child_constructor{},
-                                                                   target_vector,
-                                                                   offsets_column->view(),
-                                                                   source_lists_column_view,
-                                                                   target_lists_column_view,
-                                                                   stream,
-                                                                   mr);
+  auto child_column = build_lists_child_column_recursive(child_column_type,
+                                                         target_vector,
+                                                         offsets_column->view(),
+                                                         source_lists_column_view,
+                                                         target_lists_column_view,
+                                                         stream,
+                                                         mr);
 
   auto null_mask =
     target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr};
@@ -737,8 +130,6 @@ std::unique_ptr<column> scatter_impl(
                                  mr);
 }
 
-}  // namespace
-
 /**
  * @brief Scatters lists into a copy of the target column
  * according to a scatter map.
diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh
new file mode 100644
index 00000000000..76121bc35e9
--- /dev/null
+++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/lists/list_device_view.cuh>
+#include <cudf/lists/lists_column_view.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <memory>
+
+namespace cudf {
+namespace lists {
+namespace detail {
+
+/**
+ * @brief Holder for a list row's positional information, without
+ *        also holding a reference to the list column.
+ *
+ * Analogous to the list_view, this class is default constructible,
+ * and can thus be stored in rmm::device_uvector. It is used to represent
+ * the results of a `scatter()` operation; a device_uvector may hold
+ * several instances of unbound_list_view, each with a flag indicating
+ * whether it came from the scatter source or target. Each instance
+ * may later be "bound" to the appropriate source/target column, to
+ * reconstruct the list_view.
+ */
+struct unbound_list_view {
+  /**
+   * @brief Flag type, indicating whether this list row originated from
+   *        the source or target column, in `scatter()`.
+   */
+  enum class label_type : bool { SOURCE, TARGET };
+
+  using lists_column_device_view = cudf::detail::lists_column_device_view;
+  using list_device_view         = cudf::list_device_view;
+
+  unbound_list_view()                         = default;
+  unbound_list_view(unbound_list_view const&) = default;
+  unbound_list_view(unbound_list_view&&)      = default;
+  unbound_list_view& operator=(unbound_list_view const&) = default;
+  unbound_list_view& operator=(unbound_list_view&&) = default;
+
+  /**
+   * @brief __device__ Constructor, for use from `scatter()`.
+   *
+   * @param scatter_source_label Whether the row came from source or target
+   * @param lists_column The actual source/target lists column
+   * @param row_index Index of the row in lists_column that this instance represents
+   */
+  CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label,
+                                         cudf::detail::lists_column_device_view const& lists_column,
+                                         size_type const& row_index)
+    : _label{scatter_source_label}, _row_index{row_index}
+  {
+    _size = list_device_view{lists_column, row_index}.size();
+  }
+
+  /**
+   * @brief __device__ Constructor, for use when constructing the child column
+   *        of a scattered list column
+   *
+   * @param scatter_source_label Whether the row came from source or target
+   * @param row_index Index of the row that this instance represents in the source/target column
+   * @param size The number of elements in this list row
+   */
+  CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label,
+                                         size_type const& row_index,
+                                         size_type const& size)
+    : _label{scatter_source_label}, _row_index{row_index}, _size{size}
+  {
+  }
+
+  /**
+   * @brief Returns number of elements in this list row.
+   */
+  CUDA_DEVICE_CALLABLE size_type size() const { return _size; }
+
+  /**
+   * @brief Returns whether this row came from the `scatter()` source or target
+   */
+  CUDA_DEVICE_CALLABLE label_type label() const { return _label; }
+
+  /**
+   * @brief Returns the index in the source/target column
+   */
+  CUDA_DEVICE_CALLABLE size_type row_index() const { return _row_index; }
+
+  /**
+   * @brief Binds to source/target column (depending on SOURCE/TARGET labels),
+   *        to produce a bound list_device_view.
+   *
+   * @param scatter_source Source column for the scatter operation
+   * @param scatter_target Target column for the scatter operation
+   * @return A (bound) list_device_view for the row that this object represents
+   */
+  CUDA_DEVICE_CALLABLE list_device_view
+  bind_to_column(lists_column_device_view const& scatter_source,
+                 lists_column_device_view const& scatter_target) const
+  {
+    return list_device_view(_label == label_type::SOURCE ? scatter_source : scatter_target,
+                            _row_index);
+  }
+
+ private:
+  // Note: Cannot store reference to list column, because of storage in device_uvector.
+  // Only keep track of whether this list row came from the source or target of scatter.
+
+  label_type _label{
+    label_type::SOURCE};   // Whether this list row came from the scatter source or target.
+  size_type _row_index{};  // Row index in the Lists column.
+  size_type _size{};       // Number of elements in *this* list row.
+};
+
+/**
+ * @brief Checks that the specified columns have matching schemas, all the way down.
+ */
+void assert_same_data_type(column_view const& lhs, column_view const& rhs);
+
+std::unique_ptr<column> build_lists_child_column_recursive(
+  data_type child_column_type,
+  rmm::device_uvector<unbound_list_view> const& list_vector,
+  cudf::column_view const& list_offsets,
+  cudf::lists_column_view const& source_lists_column_view,
+  cudf::lists_column_view const& target_lists_column_view,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace lists
+}  // namespace cudf
diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu
new file mode 100644
index 00000000000..c57327569a4
--- /dev/null
+++ b/cpp/src/lists/copying/scatter_helper.cu
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/valid_if.cuh>
+#include <cudf/lists/detail/copying.hpp>
+#include <cudf/lists/detail/scatter_helper.cuh>
+#include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <thrust/binary_search.h>
+
+namespace cudf {
+namespace lists {
+namespace detail {
+
+void assert_same_data_type(column_view const& lhs, column_view const& rhs)
+{
+  CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Mismatched Data types.");
+  // An empty strings column has no children, so the two child counts may legitimately differ.
+  // Stop here for strings: walking lhs's children would index past the end of rhs's.
+  if (lhs.type().id() == type_id::STRING) { return; }
+  CUDF_EXPECTS(lhs.num_children() == rhs.num_children(), "Mismatched number of child columns.");
+  for (int i{0}; i < lhs.num_children(); ++i) { assert_same_data_type(lhs.child(i), rhs.child(i)); }
+}
+
+/**
+ * @brief Builds the validity mask for the child column of a scattered lists column
+ *
+ * @param parent_list_vector One unbound_list_view per row of the parent lists column
+ * @param parent_list_offsets Offsets column of the parent lists column
+ * @param source_lists Device view of the source lists column of the scatter operation
+ * @param target_lists Device view of the target lists column of the scatter operation
+ * @param num_child_rows Number of rows in child column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate child column's null mask
+ * @return std::pair<rmm::device_buffer, size_type> Child column's null mask and null row count
+ */
+std::pair<rmm::device_buffer, size_type> construct_child_nullmask(
+  rmm::device_uvector<unbound_list_view> const& parent_list_vector,
+  column_view const& parent_list_offsets,
+  cudf::detail::lists_column_device_view const& source_lists,
+  cudf::detail::lists_column_device_view const& target_lists,
+  size_type num_child_rows,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  auto child_row_is_valid = [d_rows        = parent_list_vector.begin(),
+                             offsets_begin = parent_list_offsets.template data<size_type>(),
+                             num_offsets   = parent_list_offsets.size(),
+                             source_lists,
+                             target_lists] __device__(auto const& child_row) {
+    // The owning list is the one whose offset range contains `child_row`.
+    auto next_offset =
+      thrust::upper_bound(thrust::seq, offsets_begin, offsets_begin + num_offsets, child_row);
+    auto list_index    = (next_offset - offsets_begin) - 1;
+    auto element_index = child_row - offsets_begin[list_index];
+    auto list_row      = d_rows[list_index].bind_to_column(source_lists, target_lists);
+    return !list_row.is_null(element_index);
+  };
+
+  return cudf::detail::valid_if(thrust::make_counting_iterator<size_type>(0),
+                                thrust::make_counting_iterator<size_type>(num_child_rows),
+                                child_row_is_valid,
+                                stream,
+                                mr);
+}
+
+/**
+ * @brief (type_dispatch endpoint) Functor that constructs the child column result
+ *        of `scatter()`ing a list column.
+ *
+ * The protocol is as follows:
+ *
+ * Inputs:
+ *  1. list_vector:  A device_uvector of unbound_list_view, with each element
+ *                   indicating the position, size, and which column the list
+ *                   row came from.
+ *  2. list_offsets: The offsets column for the (outer) lists column, each offset
+ *                   marking the beginning of a list row.
+ *  3. source_list:  The lists-column that is the source of the scatter().
+ *  4. target_list:  The lists-column that is the target of the scatter().
+ *
+ * Output: A (possibly non-list) child column, which may be used in combination
+ *         with list_offsets to fully construct the outer list.
+ *
+ * Example:
+ *
+ * Consider the following scatter operation of two `list<int>` columns:
+ *
+ * 1. Source:      [{9,9,9,9}, {8,8,8}], i.e.
+ *    a. Child:    [9,9,9,9,8,8,8]
+ *    b. Offsets:  [0,      4,    7]
+ *
+ * 2. Target:      [{1,1}, {2,2}, {3,3}], i.e.
+ *    a. Child:    [1,1,2,2,3,3]
+ *    b. Offsets:  [0,  2,  4,  6]
+ *
+ * 3. Scatter-map: [2, 0]
+ *
+ * 4. Expected output: [{8,8,8}, {2,2}, {9,9,9,9}], i.e.
+ *    a. Child:        [8,8,8,2,2,9,9,9,9]  <--- THIS
+ *    b. Offsets:      [0,    3,  5,     9]
+ *
+ * `list_child_constructor` constructs the Expected Child column indicated above.
+ *
+ * `list_child_constructor` expects to be called with the `Source`/`Target`
+ * lists columns, along with the following:
+ *
+ * 1. list_vector: [ S[1](3), T[1](2), S[0](4) ]
+ *    Each unbound_list_view (e.g. S[1](3)) indicates:
+ *      a. Which column the row is bound to: S == Source, T == Target
+ *      b. The list index. E.g. S[1] indicates the 2nd list row of the Source column.
+ *      c. The row size.   E.g. S[1](3) indicates that the row has 3 elements.
+ *
+ * 2. list_offsets: [0, 3, 5, 9]
+ *    The caller may construct this as a leading 0 followed by an
+ *    `inclusive_scan()` of the `list_vector` element sizes.
+ */
+struct list_child_constructor {
+ private:
+  /**
+   * @brief Trait reporting whether lists with child element type `T` can be scattered.
+   *
+   * @tparam T The data type of the child column of the list being scattered.
+   */
+  template <typename T>
+  struct is_supported_child_type {
+    static constexpr bool value =
+      cudf::is_fixed_width<T>() || std::is_same<T, string_view>::value ||
+      std::is_same<T, list_view>::value || std::is_same<T, struct_view>::value;
+  };
+
+ public:
+  // Fallback overload, selected via SFINAE for child column types that cannot be scattered.
+  template <typename T, typename... Args>
+  std::enable_if_t<!is_supported_child_type<T>::value, std::unique_ptr<column>> operator()(
+    Args&&...)
+  {
+    CUDF_FAIL("list_child_constructor unsupported!");
+  }
+
+  /**
+   * @brief Builds the child column when the list elements have fixed-width type `T`.
+   */
+  template <typename T>
+  std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<column>> operator()(
+    rmm::device_uvector<unbound_list_view> const& list_vector,
+    cudf::column_view const& list_offsets,
+    cudf::lists_column_view const& source_lists_column_view,
+    cudf::lists_column_view const& target_lists_column_view,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr) const
+  {
+    // Device views of both inputs; each unbound_list_view gets bound to one of them on demand.
+    auto d_source_column = column_device_view::create(source_lists_column_view.parent(), stream);
+    auto d_target_column = column_device_view::create(target_lists_column_view.parent(), stream);
+    auto d_source_lists  = cudf::detail::lists_column_device_view(*d_source_column);
+    auto d_target_lists  = cudf::detail::lists_column_device_view(*d_target_column);
+
+    // The final offset is the total number of child rows in the output.
+    auto const num_child_rows =
+      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream);
+
+    auto validity =
+      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
+        ? construct_child_nullmask(
+            list_vector, list_offsets, d_source_lists, d_target_lists, num_child_rows, stream, mr)
+        : std::make_pair(rmm::device_buffer{}, 0);
+
+    auto child_column = cudf::make_fixed_width_column(source_lists_column_view.child().type(),
+                                                      num_child_rows,
+                                                      std::move(validity.first),
+                                                      validity.second,
+                                                      stream,
+                                                      mr);
+
+    thrust::transform(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(child_column->size()),
+      child_column->mutable_view().begin<T>(),
+      [d_offsets   = list_offsets.begin<offset_type>(),
+       num_offsets = list_offsets.size(),
+       d_rows      = list_vector.begin(),
+       d_source_lists,
+       d_target_lists] __device__(auto child_row) {
+        auto const next_offset =
+          thrust::upper_bound(thrust::seq, d_offsets, d_offsets + num_offsets, child_row);
+        auto const list_idx =
+          static_cast<size_type>(thrust::distance(d_offsets, next_offset) - 1);
+        auto const element_idx = static_cast<size_type>(child_row - d_offsets[list_idx]);
+        auto bound_row         = d_rows[list_idx].bind_to_column(d_source_lists, d_target_lists);
+        return bound_row.template element<T>(element_idx);
+      });
+
+    return child_column;
+  }
+
+  /**
+   * @brief Builds the child column when the list elements are strings.
+   *
+   * Gathers one string_view per output child row, then builds offsets and chars from those views.
+   */
+  template <typename T>
+  std::enable_if_t<std::is_same<T, string_view>::value, std::unique_ptr<column>> operator()(
+    rmm::device_uvector<unbound_list_view> const& list_vector,
+    cudf::column_view const& list_offsets,
+    cudf::lists_column_view const& source_lists_column_view,
+    cudf::lists_column_view const& target_lists_column_view,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr) const
+  {
+    // Device views of both inputs; each unbound_list_view gets bound to one of them on demand.
+    auto d_source_column = column_device_view::create(source_lists_column_view.parent(), stream);
+    auto d_target_column = column_device_view::create(target_lists_column_view.parent(), stream);
+    auto d_source_lists  = cudf::detail::lists_column_device_view(*d_source_column);
+    auto d_target_lists  = cudf::detail::lists_column_device_view(*d_target_column);
+
+    auto const num_child_rows =
+      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream);
+
+    if (num_child_rows == 0) { return make_empty_column(data_type{type_id::STRING}); }
+
+    auto d_string_views = rmm::device_uvector<string_view>(num_child_rows, stream);
+
+    thrust::transform(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator<size_type>(0),
+      thrust::make_counting_iterator<size_type>(d_string_views.size()),
+      d_string_views.begin(),
+      [d_offsets   = list_offsets.begin<offset_type>(),
+       num_offsets = list_offsets.size(),
+       d_rows      = list_vector.begin(),
+       d_source_lists,
+       d_target_lists] __device__(auto child_row) {
+        auto const next_offset =
+          thrust::upper_bound(thrust::seq, d_offsets, d_offsets + num_offsets, child_row);
+        auto const list_idx =
+          static_cast<size_type>(thrust::distance(d_offsets, next_offset) - 1);
+        auto const element_idx = static_cast<size_type>(child_row - d_offsets[list_idx]);
+        auto row_in_column     = d_rows[list_idx].row_index();
+        auto bound_row         = d_rows[list_idx].bind_to_column(d_source_lists, d_target_lists);
+        auto bound_column      = bound_row.get_column();
+        auto d_list_offsets    = bound_column.offsets().template data<offset_type>();
+        auto d_strings         = bound_column.child();
+        auto d_string_offsets =
+          d_strings.child(cudf::strings_column_view::offsets_column_index)
+            .template data<offset_type>();
+        auto d_chars =
+          d_strings.child(cudf::strings_column_view::chars_column_index)
+            .template data<char>();
+
+        // Locate the string within the bound column's child, then its bytes within the chars.
+        auto string_idx = d_list_offsets[row_in_column] + element_idx;
+        auto first_byte = d_string_offsets[string_idx];
+        auto byte_count = d_string_offsets[string_idx + 1] - first_byte;
+        return string_view{d_chars + first_byte, byte_count};
+      });
+
+    // d_string_views should now have been populated with source and target references.
+
+    auto offsets_column = cudf::strings::detail::child_offsets_from_string_iterator(
+      d_string_views.begin(), d_string_views.size(), stream, mr);
+
+    auto chars_column = cudf::strings::detail::child_chars_from_string_vector(
+      d_string_views, offsets_column->view(), stream, mr);
+    auto validity =
+      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
+        ? construct_child_nullmask(
+            list_vector, list_offsets, d_source_lists, d_target_lists, num_child_rows, stream, mr)
+        : std::make_pair(rmm::device_buffer{}, 0);
+
+    return cudf::make_strings_column(num_child_rows,
+                                     std::move(offsets_column),
+                                     std::move(chars_column),
+                                     validity.second,            // Null count.
+                                     std::move(validity.first),  // Null mask.
+                                     stream,
+                                     mr);
+  }
+
+  /**
+   * @brief (Recursively) Constructs a child column that is itself a list column.
+   */
+  template <typename T>
+  std::enable_if_t<std::is_same<T, list_view>::value, std::unique_ptr<column>> operator()(
+    rmm::device_uvector<unbound_list_view> const& list_vector,
+    cudf::column_view const& list_offsets,
+    cudf::lists_column_view const& source_lists_column_view,
+    cudf::lists_column_view const& target_lists_column_view,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr) const
+  {
+    auto source_column_device_view =
+      column_device_view::create(source_lists_column_view.parent(), stream);
+    auto target_column_device_view =
+      column_device_view::create(target_lists_column_view.parent(), stream);
+    auto source_lists = cudf::detail::lists_column_device_view(*source_column_device_view);
+    auto target_lists = cudf::detail::lists_column_device_view(*target_column_device_view);
+
+    auto const num_child_rows{
+      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream)};
+
+    if (num_child_rows == 0) {
+      // make an empty lists column using the input child type
+      return empty_like(source_lists_column_view.child());
+    }
+    // Scratch storage only (it is never returned), so it is not allocated from `mr`.
+    auto child_list_views = rmm::device_uvector<unbound_list_view>(num_child_rows, stream);
+
+    // Describe every row of the child lists column as an unbound_list_view of its own.
+    // For instance, if a parent list row has 3 elements, those 3 elements (each itself a list)
+    // yield 3 corresponding child unbound_list_view instances.
+    thrust::transform(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator<size_type>(0),
+      thrust::make_counting_iterator<size_type>(child_list_views.size()),
+      child_list_views.begin(),
+      [offset_begin  = list_offsets.begin<offset_type>(),
+       offset_size   = list_offsets.size(),
+       d_list_vector = list_vector.begin(),
+       source_lists,
+       target_lists] __device__(auto index) {
+        auto const list_index_iter =
+          thrust::upper_bound(thrust::seq, offset_begin, offset_begin + offset_size, index);
+        auto const list_index =
+          static_cast<size_type>(thrust::distance(offset_begin, list_index_iter) - 1);
+        auto const intra_index = static_cast<size_type>(index - offset_begin[list_index]);
+        auto label             = d_list_vector[list_index].label();
+        auto row_index         = d_list_vector[list_index].row_index();
+        auto actual_list_row = d_list_vector[list_index].bind_to_column(source_lists, target_lists);
+        auto lists_column    = actual_list_row.get_column();
+        auto child_lists_column = lists_column.child();
+        auto lists_offsets_ptr  = lists_column.offsets().template data<offset_type>();
+        auto child_lists_offsets_ptr =
+          child_lists_column.child(lists_column_view::offsets_column_index)
+            .template data<offset_type>();
+        auto child_row_index = lists_offsets_ptr[row_index] + intra_index;
+        auto size =
+          child_lists_offsets_ptr[child_row_index + 1] - child_lists_offsets_ptr[child_row_index];
+        return unbound_list_view{label, child_row_index, size};
+      });
+
+    // child_list_views should now have been populated, with source and target references.
+
+    auto begin = thrust::make_transform_iterator(
+      child_list_views.begin(), [] __device__(auto const& row) { return row.size(); });
+
+    auto child_offsets = cudf::strings::detail::make_offsets_child_column(
+      begin, begin + child_list_views.size(), stream, mr);
+
+    auto child_column = cudf::type_dispatcher<dispatch_storage_type>(
+      source_lists_column_view.child().child(lists_column_view::child_column_index).type(),
+      list_child_constructor{},
+      child_list_views,
+      child_offsets->view(),
+      cudf::lists_column_view(source_lists_column_view.child()),
+      cudf::lists_column_view(target_lists_column_view.child()),
+      stream,
+      mr);
+
+    auto child_null_mask =
+      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
+        ? construct_child_nullmask(
+            list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr)
+        : std::make_pair(rmm::device_buffer{}, 0);
+
+    return cudf::make_lists_column(num_child_rows,
+                                   std::move(child_offsets),
+                                   std::move(child_column),
+                                   child_null_mask.second,            // Null count
+                                   std::move(child_null_mask.first),  // Null mask
+                                   stream,
+                                   mr);
+  }
+
+  /**
+   * @brief (Recursively) constructs child columns that are structs.
+   *
+   * Each struct member is projected as the child of a lists view that reuses the parent's
+   * offsets, and is then dispatched back through `list_child_constructor`.
+   */
+  template <typename T>
+  std::enable_if_t<std::is_same<T, struct_view>::value, std::unique_ptr<column>> operator()(
+    rmm::device_uvector<unbound_list_view> const& list_vector,
+    cudf::column_view const& list_offsets,
+    cudf::lists_column_view const& source_lists_column_view,
+    cudf::lists_column_view const& target_lists_column_view,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr) const
+  {
+    auto const source_column_device_view =
+      column_device_view::create(source_lists_column_view.parent(), stream);
+    auto const target_column_device_view =
+      column_device_view::create(target_lists_column_view.parent(), stream);
+    auto const source_lists = cudf::detail::lists_column_device_view(*source_column_device_view);
+    auto const target_lists = cudf::detail::lists_column_device_view(*target_column_device_view);
+
+    auto const source_structs = source_lists_column_view.child();
+    auto const target_structs = target_lists_column_view.child();
+
+    auto const num_child_rows{
+      cudf::detail::get_value<size_type>(list_offsets, list_offsets.size() - 1, stream)};
+
+    auto const num_struct_members =
+      std::distance(source_structs.child_begin(), source_structs.child_end());
+    std::vector<std::unique_ptr<column>> child_columns;
+    child_columns.reserve(num_struct_members);
+
+    // The projected views below keep raw pointers into these bitmask copies, so each copy is
+    // made once here and stays alive until every member column has been built.
+    auto const source_list_nullmask =
+      cudf::detail::copy_bitmask(source_lists_column_view.parent(), stream, mr);
+    auto const target_list_nullmask =
+      cudf::detail::copy_bitmask(target_lists_column_view.parent(), stream, mr);
+
+    auto const project_member_as_list_view = [](column_view const& structs_member,
+                                                lists_column_view const& structs_list,
+                                                rmm::device_buffer const& structs_list_nullmask) {
+      return lists_column_view(
+        column_view(data_type{type_id::LIST},
+                    structs_list.size(),
+                    nullptr,
+                    static_cast<bitmask_type const*>(structs_list_nullmask.data()),
+                    structs_list.null_count(),
+                    0,
+                    {structs_list.offsets(), structs_member}));
+    };
+
+    auto const iter_source_member_as_list = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<cudf::size_type>(0), [&](auto child_idx) {
+        return project_member_as_list_view(
+          source_structs.child(child_idx), source_lists_column_view, source_list_nullmask);
+      });
+
+    auto const iter_target_member_as_list = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<cudf::size_type>(0), [&](auto child_idx) {
+        return project_member_as_list_view(
+          target_structs.child(child_idx), target_lists_column_view, target_list_nullmask);
+      });
+
+    std::transform(iter_source_member_as_list,
+                   iter_source_member_as_list + num_struct_members,
+                   iter_target_member_as_list,
+                   std::back_inserter(child_columns),
+                   [&](auto source_struct_member_list_view, auto target_struct_member_list_view) {
+                     return cudf::type_dispatcher<dispatch_storage_type>(
+                       source_struct_member_list_view.child().type(),
+                       list_child_constructor{},
+                       list_vector,
+                       list_offsets,
+                       source_struct_member_list_view,
+                       target_struct_member_list_view,
+                       stream,
+                       mr);
+                   });
+
+    auto child_null_mask =
+      source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
+        ? construct_child_nullmask(
+            list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr)
+        : std::make_pair(rmm::device_buffer{}, 0);
+
+    return cudf::make_structs_column(num_child_rows,
+                                     std::move(child_columns),
+                                     child_null_mask.second,
+                                     std::move(child_null_mask.first),
+                                     stream,
+                                     mr);
+  }
+};
+
+std::unique_ptr<column> build_lists_child_column_recursive(
+  data_type child_column_type,  // selects which list_child_constructor overload runs
+  rmm::device_uvector<unbound_list_view> const& list_vector,
+  cudf::column_view const& list_offsets,
+  cudf::lists_column_view const& source_lists_column_view,
+  cudf::lists_column_view const& target_lists_column_view,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  return cudf::type_dispatcher<dispatch_storage_type>(child_column_type,
+                                                      list_child_constructor{},
+                                                      list_vector,
+                                                      list_offsets,
+                                                      source_lists_column_view,
+                                                      target_lists_column_view,
+                                                      stream,
+                                                      mr);
+}
+
+}  // namespace detail
+}  // namespace lists
+}  // namespace cudf

From 2ce0835212759a0e2f3efbce34759d5308bf3e40 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 8 Jun 2021 00:36:56 -0400
Subject: [PATCH 12/15] Replace make_empty_strings_column with
 make_empty_column (#8435)

Closes #8352

This PR replaces the `cudf::strings::detail::make_empty_strings_column` call with the existing `cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING})` instead.

This allowed also removing the include of `cudf/strings/detail/utilities.hpp` from some source files as well.

No function has changed. The `make_empty_strings_column` detail utility is removed.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Conor Hoekstra (https://github.com/codereport)

URL: https://github.com/rapidsai/cudf/pull/8435
---
 cpp/include/cudf/strings/detail/copy_if_else.cuh     |  3 +--
 cpp/include/cudf/strings/detail/gather.cuh           |  2 +-
 cpp/include/cudf/strings/detail/merge.cuh            |  3 +--
 cpp/include/cudf/strings/detail/scatter.cuh          |  2 +-
 .../cudf/strings/detail/strings_column_factories.cuh |  5 ++---
 cpp/include/cudf/strings/detail/utilities.hpp        | 11 -----------
 cpp/src/copying/copy.cpp                             | 10 ----------
 cpp/src/interop/from_arrow.cu                        |  3 +--
 cpp/src/io/utilities/column_buffer.cpp               |  3 ---
 cpp/src/reshape/interleave_columns.cu                |  2 +-
 cpp/src/strings/capitalize.cu                        |  5 ++---
 cpp/src/strings/case.cu                              |  3 +--
 cpp/src/strings/combine/concatenate.cu               |  5 ++---
 cpp/src/strings/combine/join.cu                      |  3 +--
 cpp/src/strings/combine/join_list_elements.cu        |  5 ++---
 cpp/src/strings/convert/convert_booleans.cu          |  2 +-
 cpp/src/strings/convert/convert_datetime.cu          |  2 +-
 cpp/src/strings/convert/convert_durations.cu         |  2 +-
 cpp/src/strings/convert/convert_fixed_point.cu       |  2 +-
 cpp/src/strings/convert/convert_floats.cu            |  2 +-
 cpp/src/strings/convert/convert_integers.cu          |  2 +-
 cpp/src/strings/convert/convert_ipv4.cu              |  2 +-
 cpp/src/strings/convert/convert_urls.cu              |  4 ++--
 cpp/src/strings/copying/concatenate.cu               |  3 +--
 cpp/src/strings/copying/copying.cu                   |  3 +--
 cpp/src/strings/filling/fill.cu                      |  2 +-
 cpp/src/strings/filter_chars.cu                      |  3 +--
 cpp/src/strings/padding.cu                           |  4 ++--
 cpp/src/strings/replace/backref_re.cu                |  3 +--
 cpp/src/strings/replace/multi_re.cu                  |  3 +--
 cpp/src/strings/replace/replace.cu                   | 12 ++++++------
 cpp/src/strings/replace/replace_re.cu                |  3 +--
 cpp/src/strings/split/split.cu                       |  3 +--
 cpp/src/strings/strip.cu                             |  3 +--
 cpp/src/strings/substring.cu                         |  5 ++---
 cpp/src/strings/translate.cu                         |  3 +--
 cpp/src/strings/utilities.cu                         | 11 -----------
 cpp/src/strings/wrap.cu                              |  3 +--
 cpp/tests/copying/pack_tests.cu                      |  5 ++---
 cpp/tests/copying/split_tests.cpp                    |  3 +--
 40 files changed, 49 insertions(+), 106 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh
index bffcb5c1a31..6d9bd9af8da 100644
--- a/cpp/include/cudf/strings/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh
@@ -20,7 +20,6 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -62,7 +61,7 @@ std::unique_ptr<cudf::column> copy_if_else(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   auto strings_count = std::distance(lhs_begin, lhs_end);
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   // create null mask
   auto valid_mask = cudf::detail::valid_if(
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index dcd17245ee6..9215f1f5a0f 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -292,7 +292,7 @@ std::unique_ptr<cudf::column> gather(
 {
   auto const output_count  = std::distance(begin, end);
   auto const strings_count = strings.size();
-  if (output_count == 0) return make_empty_strings_column(stream, mr);
+  if (output_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   // allocate offsets column and use memory to compute string size in each output row
   auto out_offsets_column = make_numeric_column(
diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh
index 8d893a120dc..cd9790b1545 100644
--- a/cpp/include/cudf/strings/detail/merge.cuh
+++ b/cpp/include/cudf/strings/detail/merge.cuh
@@ -20,7 +20,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/merge.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 
@@ -54,7 +53,7 @@ std::unique_ptr<column> merge(strings_column_view const& lhs,
 {
   using cudf::detail::side;
   size_type strings_count = static_cast<size_type>(std::distance(begin, end));
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   auto lhs_column = column_device_view::create(lhs.parent(), stream);
   auto d_lhs      = *lhs_column;
diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh
index 342afae7336..0522ceace1a 100644
--- a/cpp/include/cudf/strings/detail/scatter.cuh
+++ b/cpp/include/cudf/strings/detail/scatter.cuh
@@ -61,7 +61,7 @@ std::unique_ptr<column> scatter(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  if (target.is_empty()) return make_empty_strings_column(stream, mr);
+  if (target.is_empty()) return make_empty_column(data_type{type_id::STRING});
 
   // create vector of string_view's to scatter into
   rmm::device_uvector<string_view> target_vector = create_string_vector_from_column(target, stream);
diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
index 166deb6560d..7a6006a8292 100644
--- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh
+++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -20,7 +20,6 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/strings/detail/gather.cuh>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -63,7 +62,7 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
 {
   CUDF_FUNC_RANGE();
   size_type strings_count = thrust::distance(begin, end);
-  if (strings_count == 0) return strings::detail::make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   using string_index_pair = thrust::pair<const char*, size_type>;
 
@@ -167,7 +166,7 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
   CUDF_FUNC_RANGE();
   size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1;
   size_type bytes         = std::distance(chars_begin, chars_end) * sizeof(char);
-  if (strings_count == 0) return strings::detail::make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(null_count < strings_count, "null strings column not yet supported");
   CUDF_EXPECTS(bytes >= 0, "invalid offsets data");
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index 4eff3f2dafc..0cee185068e 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -41,17 +41,6 @@ std::unique_ptr<column> create_chars_child_column(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Create a strings column with no strings.
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return Empty strings column
- */
-std::unique_ptr<column> make_empty_strings_column(
-  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 /**
  * @brief Creates a string_view vector from a strings column.
  *
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index 670c147aa7e..8cc5549b3c5 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -21,7 +21,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/traits.hpp>
 
@@ -59,15 +58,6 @@ struct scalar_empty_like_functor_impl {
   }
 };
 
-template <>
-struct scalar_empty_like_functor_impl<cudf::string_view> {
-  std::unique_ptr<column> operator()(scalar const& input)
-  {
-    return cudf::strings::detail::make_empty_strings_column(rmm::cuda_stream_default,
-                                                            rmm::mr::get_current_device_resource());
-  }
-};
-
 template <>
 struct scalar_empty_like_functor_impl<cudf::list_view> {
   std::unique_ptr<column> operator()(scalar const& input)
diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
index 9475d3136e5..ee80f06e861 100644
--- a/cpp/src/interop/from_arrow.cu
+++ b/cpp/src/interop/from_arrow.cu
@@ -26,7 +26,6 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -272,7 +271,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  if (array.length() == 0) { return cudf::strings::detail::make_empty_strings_column(stream, mr); }
+  if (array.length() == 0) { return make_empty_column(data_type{type_id::STRING}); }
   auto str_array    = static_cast<arrow::StringArray const*>(&array);
   auto offset_array = std::make_unique<arrow::Int32Array>(
     str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index d60c7e4fad4..40a5d411290 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -21,7 +21,6 @@
 
 #include "column_buffer.hpp"
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 
 namespace cudf {
 namespace io {
@@ -191,8 +190,6 @@ std::unique_ptr<column> empty_like(column_buffer& buffer,
                                  mr);
     } break;
 
-    case type_id::STRING: return cudf::strings::detail::make_empty_strings_column(stream, mr);
-
     default: return cudf::make_empty_column(buffer.type);
   }
 }
diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 9024584a16b..9a6df2a664c 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -62,7 +62,7 @@ struct interleave_columns_functor {
 
     auto strings_count = strings_columns.num_rows();
     if (strings_count == 0)  // All columns have 0 rows
-      return strings::detail::make_empty_strings_column(stream, mr);
+      return make_empty_column(data_type{type_id::STRING});
 
     // Create device views from the strings columns.
     auto table       = table_device_view::create(strings_columns, stream);
diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu
index b6526a5f52c..bbc420d5990 100644
--- a/cpp/src/strings/capitalize.cu
+++ b/cpp/src/strings/capitalize.cu
@@ -24,7 +24,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/capitalize.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 
@@ -172,7 +171,7 @@ std::unique_ptr<column> capitalize(strings_column_view const& input,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
-  if (input.is_empty()) return detail::make_empty_strings_column(stream, mr);
+  if (input.is_empty()) return make_empty_column(data_type{type_id::STRING});
   auto d_column = column_device_view::create(input.parent(), stream);
   return capitalize_utility(capitalize_fn{*d_column}, input, stream, mr);
 }
@@ -181,7 +180,7 @@ std::unique_ptr<column> title(strings_column_view const& input,
                               rmm::cuda_stream_view stream,
                               rmm::mr::device_memory_resource* mr)
 {
-  if (input.is_empty()) return detail::make_empty_strings_column(stream, mr);
+  if (input.is_empty()) return make_empty_column(data_type{type_id::STRING});
   auto d_column = column_device_view::create(input.parent(), stream);
   return capitalize_utility(title_fn{*d_column}, input, stream, mr);
 }
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index a7934e6641b..cc0edb49a33 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -24,7 +24,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
@@ -126,7 +125,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& strings,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return detail::make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
 
   auto strings_column = column_device_view::create(strings.parent(), stream);
   auto d_column       = *strings_column;
diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu
index 1329ad3113f..b2db61942d2 100644
--- a/cpp/src/strings/combine/concatenate.cu
+++ b/cpp/src/strings/combine/concatenate.cu
@@ -22,7 +22,6 @@
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -130,7 +129,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                "All columns must be of type string");
   auto const strings_count = strings_columns.num_rows();
   if (strings_count == 0)  // empty begets empty
-    return detail::make_empty_strings_column(stream, mr);
+    return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar");
   string_view d_separator(separator.data(), separator.size());
@@ -222,7 +221,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
   CUDF_EXPECTS(strings_count == separators.size(),
                "Separators column should be the same size as the strings columns");
   if (strings_count == 0)  // Empty begets empty
-    return detail::make_empty_strings_column(stream, mr);
+    return make_empty_column(data_type{type_id::STRING});
 
   // Invalid output column strings - null rows
   string_view const invalid_str{nullptr, 0};
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index cdfaf856513..ffe95118f19 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -22,7 +22,6 @@
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
@@ -44,7 +43,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& strings,
                                      rmm::mr::device_memory_resource* mr)
 {
   auto strings_count = strings.size();
-  if (strings_count == 0) return detail::make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar");
 
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index 7edb0cd8e7b..c012663794b 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -22,7 +22,6 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -160,7 +159,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
   CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar");
 
   auto const num_rows = lists_strings_column.size();
-  if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); }
+  if (num_rows == 0) { return make_empty_column(data_type{type_id::STRING}); }
 
   // Accessing the child strings column of the lists column must be done by calling `child()` on the
   // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the
@@ -233,7 +232,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                "Separators column should be the same size as the lists columns");
 
   auto const num_rows = lists_strings_column.size();
-  if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); }
+  if (num_rows == 0) { return make_empty_column(data_type{type_id::STRING}); }
 
   // Accessing the child strings column of the lists column must be done by calling `child()` on the
   // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the
diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu
index b0543366a8e..c69bb39bdae 100644
--- a/cpp/src/strings/convert/convert_booleans.cu
+++ b/cpp/src/strings/convert/convert_booleans.cu
@@ -96,7 +96,7 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
                                       rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = booleans.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(booleans.type().id() == type_id::BOOL8, "Input column must be boolean type");
   CUDF_EXPECTS(true_string.is_valid() && true_string.size() > 0,
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 637a612472b..379ceceaf17 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -933,7 +933,7 @@ std::unique_ptr<column> from_timestamps(column_view const& timestamps,
                                         rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = timestamps.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty.");
   timestamp_units units =
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 325fa428cc6..6923f8a24fd 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -705,7 +705,7 @@ std::unique_ptr<column> from_durations(column_view const& durations,
                                        rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = durations.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   return type_dispatcher(
     durations.type(), dispatch_from_durations_fn{}, durations, format, stream, mr);
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 5674f546c8c..3b2616ebc4f 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -427,7 +427,7 @@ std::unique_ptr<column> from_fixed_point(column_view const& input,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
-  if (input.is_empty()) return detail::make_empty_strings_column(stream, mr);
+  if (input.is_empty()) return make_empty_column(data_type{type_id::STRING});
   return type_dispatcher(input.type(), dispatch_from_fixed_point_fn{}, input, stream, mr);
 }
 
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index 0cdbea1d4ef..05142e7e5f2 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -527,7 +527,7 @@ std::unique_ptr<column> from_floats(column_view const& floats,
                                     rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = floats.size();
-  if (strings_count == 0) return detail::make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr);
 }
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index 575058eae09..01da56d7254 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -392,7 +392,7 @@ std::unique_ptr<column> from_integers(column_view const& integers,
                                       rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = integers.size();
-  if (strings_count == 0) return detail::make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   return type_dispatcher(integers.type(), dispatch_from_integers_fn{}, integers, stream, mr);
 }
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 016b9befe5c..2984069ea23 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -163,7 +163,7 @@ std::unique_ptr<column> integers_to_ipv4(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   size_type strings_count = integers.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
 
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 3be81dbd005..ce145e8a413 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -126,7 +126,7 @@ std::unique_ptr<column> url_encode(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   auto strings_column = column_device_view::create(strings.parent(), stream);
   auto d_strings      = *strings_column;
@@ -326,7 +326,7 @@ std::unique_ptr<column> url_decode(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   auto offset_count = strings_count + 1;
   auto d_offsets    = strings.offsets().data<int32_t>() + strings.offset();
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 48358cb4a38..2c7d6ebb483 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -22,7 +22,6 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/concatenate.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
 
@@ -217,7 +216,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   auto const total_bytes          = std::get<5>(device_views);
   auto const offsets_count        = strings_count + 1;
 
-  if (strings_count == 0) { return make_empty_strings_column(stream, mr); }
+  if (strings_count == 0) { return make_empty_column(data_type{type_id::STRING}); }
 
   CUDF_EXPECTS(offsets_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
                "total number of strings is too large for cudf column");
diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu
index cdf188bfdc5..a6b1d227f74 100644
--- a/cpp/src/strings/copying/copying.cu
+++ b/cpp/src/strings/copying/copying.cu
@@ -19,7 +19,6 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/strings/detail/copying.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -35,7 +34,7 @@ std::unique_ptr<cudf::column> copy_slice(strings_column_view const& strings,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
   if (end < 0 || end > strings.size()) end = strings.size();
   CUDF_EXPECTS(((start >= 0) && (start < end)), "Invalid start parameter value.");
   auto const strings_count  = end - start;
diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu
index 996fdf9997b..25e9f7a2412 100644
--- a/cpp/src/strings/filling/fill.cu
+++ b/cpp/src/strings/filling/fill.cu
@@ -40,7 +40,7 @@ std::unique_ptr<column> fill(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   auto strings_count = strings.size();
-  if (strings_count == 0) return detail::make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
   CUDF_EXPECTS((begin >= 0) && (end <= strings_count),
                "Parameters [begin,end) are outside the range of the provided strings column");
   CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values");
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
index 8c8a2ab05b1..be0715c97df 100644
--- a/cpp/src/strings/filter_chars.cu
+++ b/cpp/src/strings/filter_chars.cu
@@ -22,7 +22,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
@@ -116,7 +115,7 @@ std::unique_ptr<column> filter_characters(
   rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
   CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid");
   cudf::string_view d_replacement(replacement.data(), replacement.size());
 
diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu
index 22bbb7e7a4a..bce2ee52c1c 100644
--- a/cpp/src/strings/padding.cu
+++ b/cpp/src/strings/padding.cu
@@ -61,7 +61,7 @@ std::unique_ptr<column> pad(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
   CUDF_EXPECTS(!fill_char.empty(), "fill_char parameter must not be empty");
   char_utf8 d_fill_char    = 0;
   size_type fill_char_size = to_char_utf8(fill_char.c_str(), d_fill_char);
@@ -151,7 +151,7 @@ std::unique_ptr<column> zfill(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   auto strings_column = column_device_view::create(strings.parent(), stream);
   auto d_strings      = *strings_column;
diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu
index 07d5fefc264..2de5141bb00 100644
--- a/cpp/src/strings/replace/backref_re.cu
+++ b/cpp/src/strings/replace/backref_re.cu
@@ -24,7 +24,6 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/replace_re.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -87,7 +86,7 @@ std::unique_ptr<column> replace_with_backrefs(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
   CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty");
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index a59401db24f..12bf0810a64 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -23,7 +23,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/replace_re.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -135,7 +134,7 @@ std::unique_ptr<column> replace_re(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   auto strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
   if (patterns.empty())  // no patterns; just return a copy
     return std::make_unique<column>(strings.parent(), stream, mr);
 
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 31c6460267c..02e861433a9 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -551,7 +551,7 @@ std::unique_ptr<column> replace<replace_algorithm::AUTO>(strings_column_view con
                                                          rmm::cuda_stream_view stream,
                                                          rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
   if (maxrepl == 0) return std::make_unique<cudf::column>(strings.parent(), stream, mr);
   CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid.");
   CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid.");
@@ -590,7 +590,7 @@ std::unique_ptr<column> replace<replace_algorithm::CHAR_PARALLEL>(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
   if (maxrepl == 0) return std::make_unique<cudf::column>(strings.parent(), stream, mr);
   CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid.");
   CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid.");
@@ -623,7 +623,7 @@ std::unique_ptr<column> replace<replace_algorithm::ROW_PARALLEL>(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
   if (maxrepl == 0) return std::make_unique<cudf::column>(strings.parent(), stream, mr);
   CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid.");
   CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid.");
@@ -683,7 +683,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& strings,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
   CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid.");
   if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop.");
 
@@ -767,7 +767,7 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
   CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)),
                "Parameters targets must not be empty and must not have nulls");
   CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)),
@@ -798,7 +798,7 @@ std::unique_ptr<column> replace_nulls(strings_column_view const& strings,
                                       rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
   CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid.");
 
   string_view d_repl(repl.data(), repl.size());
diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index 9468e80fa1c..9781a3fe7c6 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -23,7 +23,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/replace_re.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -109,7 +108,7 @@ std::unique_ptr<column> replace_re(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   auto strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid");
   string_view d_repl(repl.data(), repl.size());
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index be6ace5e1fe..5194bc6e86a 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -23,7 +23,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/split/split.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -432,7 +431,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
   std::vector<std::unique_ptr<column>> results;
   auto const strings_count = strings_column.size();
   if (strings_count == 0) {
-    results.push_back(make_empty_strings_column(stream, mr));
+    results.push_back(make_empty_column(data_type{type_id::STRING}));
     return std::make_unique<table>(std::move(results));
   }
 
diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu
index 95d8eae36d4..8eb56918071 100644
--- a/cpp/src/strings/strip.cu
+++ b/cpp/src/strings/strip.cu
@@ -19,7 +19,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/strip.hpp>
@@ -101,7 +100,7 @@ std::unique_ptr<column> strip(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  if (strings.is_empty()) return detail::make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
 
   CUDF_EXPECTS(to_strip.is_valid(), "Parameter to_strip must be valid");
   string_view const d_to_strip(to_strip.data(), to_strip.size());
diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu
index 3e7c1181a25..a74f6638a61 100644
--- a/cpp/src/strings/substring.cu
+++ b/cpp/src/strings/substring.cu
@@ -24,7 +24,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/substring.hpp>
@@ -110,7 +109,7 @@ std::unique_ptr<column> slice_strings(
   rmm::cuda_stream_view stream           = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource())
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
 
   if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0");
 
@@ -300,7 +299,7 @@ std::unique_ptr<column> slice_strings(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
   CUDF_EXPECTS(starts_column.size() == strings_count,
                "Parameter starts must have the same number of rows as strings.");
   CUDF_EXPECTS(stops_column.size() == strings_count,
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index 30fc7d50b3f..cbb2f85c6a6 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -20,7 +20,6 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
@@ -88,7 +87,7 @@ std::unique_ptr<column> translate(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING});
 
   size_type table_size = static_cast<size_type>(chars_table.size());
   // convert input table
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 1b7141ccb18..3326bcab82f 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -109,17 +109,6 @@ std::unique_ptr<column> create_chars_child_column(cudf::size_type strings_count,
     data_type{type_id::INT8}, total_bytes, mask_state::UNALLOCATED, stream, mr);
 }
 
-//
-std::unique_ptr<column> make_empty_strings_column(rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
-{
-  return std::make_unique<column>(data_type{type_id::STRING},
-                                  0,
-                                  rmm::device_buffer{0, stream, mr},  // data
-                                  rmm::device_buffer{0, stream, mr},
-                                  0);  // nulls
-}
-
 namespace {
 // The device variables are created here to avoid using a singleton that may cause issues
 // with RMM initialize/finalize. See PR #3159 for details on this approach.
diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu
index f21477b3582..a9346566e78 100644
--- a/cpp/src/strings/wrap.cu
+++ b/cpp/src/strings/wrap.cu
@@ -23,7 +23,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
@@ -99,7 +98,7 @@ std::unique_ptr<column> wrap(
   CUDF_EXPECTS(width > 0, "Positive wrap width required");
 
   auto strings_count = strings.size();
-  if (strings_count == 0) return detail::make_empty_strings_column(stream, mr);
+  if (strings_count == 0) return make_empty_column(data_type{type_id::STRING});
 
   auto strings_column  = column_device_view::create(strings.parent(), stream);
   auto d_column        = *strings_column;
diff --git a/cpp/tests/copying/pack_tests.cu b/cpp/tests/copying/pack_tests.cu
index 84cf176061d..f3b9cf25357 100644
--- a/cpp/tests/copying/pack_tests.cu
+++ b/cpp/tests/copying/pack_tests.cu
@@ -15,7 +15,6 @@
  */
 
 #include <cudf/copying.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
@@ -84,7 +83,7 @@ TEST_F(PackUnpackTest, MultiColumnWithStrings)
 TEST_F(PackUnpackTest, EmptyColumns)
 {
   {
-    auto empty_string = cudf::strings::detail::make_empty_strings_column();    
+    auto empty_string = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
     cudf::table_view src_table({static_cast<cudf::column_view>(*empty_string)});
     this->run_test(src_table);
   }
@@ -356,7 +355,7 @@ TEST_F(PackUnpackTest, NestedEmpty)
   // this produces an empty strings column with no children,
   // nested inside a list
   {
-    auto empty_string = cudf::strings::detail::make_empty_strings_column();
+    auto empty_string = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
     auto offsets      = cudf::test::fixed_width_column_wrapper<int>({0, 0});
     auto list         = cudf::make_lists_column(
       1, offsets.release(), std::move(empty_string), 0, rmm::device_buffer{});
diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp
index d4e5a53aa85..c0c1ff727c6 100644
--- a/cpp/tests/copying/split_tests.cpp
+++ b/cpp/tests/copying/split_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <cudf_test/base_fixture.hpp>
@@ -1546,7 +1545,7 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty)
   // this produces an empty strings column with no children,
   // nested inside a list
   {
-    auto empty_string = cudf::strings::detail::make_empty_strings_column();
+    auto empty_string = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
     auto offsets      = cudf::test::fixed_width_column_wrapper<int>({0, 0});
     auto list         = cudf::make_lists_column(
       1, offsets.release(), std::move(empty_string), 0, rmm::device_buffer{});

From aa8264624824a108d926e51cd3be287adccc5225 Mon Sep 17 00:00:00 2001
From: Mark Harris <mharris@nvidia.com>
Date: Tue, 8 Jun 2021 21:29:23 +1000
Subject: [PATCH 13/15] Adapt `cudf::scalar` classes to changes in
 `rmm::device_scalar` (#8411)

rapidsai/rmm#789 refactors `rmm::device_scalar`, which all of `cudf::scalar` depends on. Notably, it renames some methods, makes stream parameters explicit, and deletes streamless constructors. As a result, the present PR deletes the default and non-stream copy constructors of all the `cudf::*_scalar` classes.

This should be merged immediately after rapidsai/rmm#789, because that PR breaks the cudf build until the changes in this PR are applied.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - https://github.com/brandon-b-miller
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/8411
---
 cpp/include/cudf/detail/copy_if_else.cuh      |   2 +-
 cpp/include/cudf/detail/copy_range.cuh        |   2 +-
 cpp/include/cudf/scalar/scalar.hpp            |  31 ++--
 cpp/src/copying/get_element.cu                |  10 +-
 cpp/src/io/json/reader_impl.cu                |   6 +-
 cpp/src/io/json/reader_impl.hpp               |   5 +-
 cpp/src/join/hash_join.cu                     |   2 +-
 cpp/src/join/hash_join.cuh                    |   2 +-
 cpp/src/reductions/compound.cuh               |   2 +-
 cpp/src/reductions/reductions.cpp             |   2 +-
 cpp/src/reductions/simple.cuh                 |   4 +-
 cpp/src/scalar/scalar.cpp                     |  29 +---
 cpp/src/strings/attributes.cu                 |   4 +-
 cpp/src/strings/combine/join.cu               |   6 +-
 cpp/src/strings/copying/concatenate.cu        |   5 +-
 cpp/src/strings/json/json_path.cu             |   2 +-
 cpp/src/text/ngrams_tokenize.cu               |   7 +-
 cpp/src/text/tokenize.cu                      |   3 +-
 cpp/tests/binaryop/binop-integration-test.cpp |  14 +-
 cpp/tests/binaryop/binop-null-test.cpp        |   4 +-
 cpp/tests/copying/scatter_tests.cpp           |   8 +-
 cpp/tests/filling/fill_tests.cpp              |   6 +-
 cpp/tests/lists/contains_tests.cpp            |  12 +-
 cpp/tests/replace/clamp_test.cpp              |  92 ++++++------
 cpp/tests/replace/replace_nulls_tests.cpp     |   2 +-
 cpp/tests/scalar/scalar_device_view_test.cu   |  23 +--
 cpp/tests/scalar/scalar_test.cpp              |  23 +--
 java/src/main/native/src/ColumnVectorJni.cpp  | 138 ++++++++----------
 java/src/main/native/src/ScalarJni.cpp        |  54 ++++---
 java/src/main/native/src/map_lookup.cu        |  16 +-
 java/src/main/native/src/row_conversion.cu    |   4 +-
 .../test/java/ai/rapids/cudf/ScalarTest.java  |  66 ++++-----
 python/cudf/cudf/_lib/cpp/scalar/scalar.pxd   |   2 +-
 33 files changed, 274 insertions(+), 314 deletions(-)

diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 3faba5ef51b..1acdcadaacf 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -190,7 +190,7 @@ std::unique_ptr<column> copy_if_else(
       <<<grid.num_blocks, block_size, 0, stream.value()>>>(
         lhs_begin, rhs, filter, *out_v, valid_count.data());
 
-    out->set_null_count(size - valid_count.value());
+    out->set_null_count(size - valid_count.value(stream));
   } else {
     // call the kernel
     copy_if_else_kernel<block_size, Element, LeftIter, RightIter, FilterFn, false>
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index b7479f828ec..ac59b429a2c 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -172,7 +172,7 @@ void copy_range(SourceValueIterator source_value_begin,
       target_end,
       null_count.data());
 
-    target.set_null_count(null_count.value());
+    target.set_null_count(null_count.value(stream));
   } else {
     auto kernel =
       copy_range_kernel<block_size, SourceValueIterator, SourceValidityIterator, T, false>;
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index 6938ad5feaa..e4d7baf86d9 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -73,7 +73,7 @@ class scalar {
    * @param is_valid true: set the value to valid. false: set it to null
    * @param stream CUDA stream used for device memory operations.
    */
-  void set_valid(bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+  void set_valid_async(bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
   /**
    * @brief Indicates whether the scalar contains a valid value
@@ -97,10 +97,10 @@ class scalar {
   bool const* validity_data() const;
 
  protected:
-  data_type _type{type_id::EMPTY};       ///< Logical type of value in the scalar
-  rmm::device_scalar<bool> _is_valid{};  ///< Device bool signifying validity
+  data_type _type{type_id::EMPTY};     ///< Logical type of value in the scalar
+  rmm::device_scalar<bool> _is_valid;  ///< Device bool signifying validity
 
-  scalar() = default;
+  scalar() = delete;
 
   /**
    * @brief Construct a new scalar object
@@ -175,9 +175,9 @@ class fixed_width_scalar : public scalar {
   T const* data() const;
 
  protected:
-  rmm::device_scalar<T> _data{};  ///< device memory containing the value
+  rmm::device_scalar<T> _data;  ///< device memory containing the value
 
-  fixed_width_scalar();
+  fixed_width_scalar() = delete;
 
   /**
    * @brief Construct a new fixed width scalar object
@@ -218,7 +218,7 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
   static_assert(is_numeric<T>(), "Unexpected non-numeric type.");
 
  public:
-  numeric_scalar()                       = default;
+  numeric_scalar()                       = delete;
   ~numeric_scalar()                      = default;
   numeric_scalar(numeric_scalar&& other) = default;
 
@@ -276,7 +276,7 @@ class fixed_point_scalar : public scalar {
   using rep_type   = typename T::rep;
   using value_type = T;
 
-  fixed_point_scalar();
+  fixed_point_scalar()                           = delete;
   ~fixed_point_scalar()                          = default;
   fixed_point_scalar(fixed_point_scalar&& other) = default;
 
@@ -375,7 +375,7 @@ class fixed_point_scalar : public scalar {
   rep_type const* data() const;
 
  protected:
-  rmm::device_scalar<rep_type> _data{};  ///< device memory containing the value
+  rmm::device_scalar<rep_type> _data;  ///< device memory containing the value
 };
 
 /**
@@ -385,10 +385,11 @@ class string_scalar : public scalar {
  public:
   using value_type = cudf::string_view;
 
-  string_scalar();
+  string_scalar()                      = delete;
   ~string_scalar()                     = default;
   string_scalar(string_scalar&& other) = default;
 
+  // string_scalar(string_scalar const& other) = delete;
   string_scalar& operator=(string_scalar const& other) = delete;
   string_scalar& operator=(string_scalar&& other) = delete;
 
@@ -488,7 +489,7 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
   static_assert(is_chrono<T>(), "Unexpected non-chrono type");
 
  public:
-  chrono_scalar()                      = default;
+  chrono_scalar()                      = delete;
   ~chrono_scalar()                     = default;
   chrono_scalar(chrono_scalar&& other) = default;
 
@@ -540,7 +541,7 @@ class timestamp_scalar : public chrono_scalar<T> {
   using chrono_scalar<T>::chrono_scalar;
   using rep_type = typename T::rep;
 
-  timestamp_scalar()                         = default;
+  timestamp_scalar()                         = delete;
   timestamp_scalar(timestamp_scalar&& other) = default;
 
   /**
@@ -583,7 +584,7 @@ class duration_scalar : public chrono_scalar<T> {
   using chrono_scalar<T>::chrono_scalar;
   using rep_type = typename T::rep;
 
-  duration_scalar()                        = default;
+  duration_scalar()                        = delete;
   duration_scalar(duration_scalar&& other) = default;
 
   /**
@@ -621,7 +622,7 @@ class duration_scalar : public chrono_scalar<T> {
  */
 class list_scalar : public scalar {
  public:
-  list_scalar();
+  list_scalar()                    = delete;
   ~list_scalar()                   = default;
   list_scalar(list_scalar&& other) = default;
 
@@ -681,7 +682,7 @@ class list_scalar : public scalar {
  */
 class struct_scalar : public scalar {
  public:
-  struct_scalar();
+  struct_scalar()                           = delete;
   ~struct_scalar()                          = default;
   struct_scalar(struct_scalar&& other)      = default;
   struct_scalar(struct_scalar const& other) = default;
diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu
index a4d863d204d..9fa5f6bf17b 100644
--- a/cpp/src/copying/get_element.cu
+++ b/cpp/src/copying/get_element.cu
@@ -66,8 +66,8 @@ struct get_element_functor {
   {
     auto device_col = column_device_view::create(input, stream);
 
-    rmm::device_scalar<string_view> temp_data;
-    rmm::device_scalar<bool> temp_valid;
+    rmm::device_scalar<string_view> temp_data(stream, mr);
+    rmm::device_scalar<bool> temp_valid(stream, mr);
 
     device_single_thread(
       [buffer   = temp_data.data(),
@@ -105,7 +105,7 @@ struct get_element_functor {
 
     if (!key_index_scalar.is_valid(stream)) {
       auto null_result = make_default_constructed_scalar(dict_view.keys().type(), stream, mr);
-      null_result->set_valid(false, stream);
+      null_result->set_valid_async(false, stream);
       return null_result;
     }
 
@@ -154,8 +154,8 @@ struct get_element_functor {
 
     auto device_col = column_device_view::create(input, stream);
 
-    rmm::device_scalar<Type> temp_data;
-    rmm::device_scalar<bool> temp_valid;
+    rmm::device_scalar<Type> temp_data(stream, mr);
+    rmm::device_scalar<bool> temp_valid(stream, mr);
 
     device_single_thread(
       [buffer   = temp_data.data(),
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 89bb05f7875..c3fcb9f613f 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -147,7 +147,7 @@ std::unique_ptr<table> create_json_keys_info_table(const parse_options_view &opt
     options, data, row_offsets, key_counter.data(), {}, stream);
 
   // Allocate columns to store hash value, length, and offset of each JSON object key in the input
-  auto const num_keys = key_counter.value();
+  auto const num_keys = key_counter.value(stream);
   std::vector<std::unique_ptr<column>> info_columns;
   info_columns.emplace_back(make_numeric_column(data_type(type_id::UINT64), num_keys));
   info_columns.emplace_back(make_numeric_column(data_type(type_id::UINT16), num_keys));
@@ -157,7 +157,7 @@ std::unique_ptr<table> create_json_keys_info_table(const parse_options_view &opt
   auto const info_table_mdv = mutable_table_device_view::create(info_table->mutable_view(), stream);
 
   // Reset the key counter - now used for indexing
-  key_counter.set_value_zero(stream);
+  key_counter.set_value_to_zero_async(stream);
   // Fill the allocated columns
   cudf::io::json::gpu::collect_keys_info(
     options, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream);
@@ -433,7 +433,7 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
     // use keys as column names if input rows are objects
     auto keys_desc         = get_json_object_keys_hashes(rec_starts, stream);
     metadata_.column_names = keys_desc.first;
-    set_column_map(std::move(keys_desc.second));
+    set_column_map(std::move(keys_desc.second), stream);
   } else {
     int cols_found = 0;
     bool quotation = false;
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index cb413630d07..fa3c34586d9 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -87,10 +87,11 @@ class reader::impl {
    * @brief Sets the column map data member and makes a device copy to be used as a kernel
    * parameter.
    */
-  void set_column_map(col_map_ptr_type &&map)
+  void set_column_map(col_map_ptr_type &&map, rmm::cuda_stream_view stream)
   {
     key_to_col_idx_map_ = std::move(map);
-    d_key_col_map_      = std::make_unique<rmm::device_scalar<col_map_type>>(*key_to_col_idx_map_);
+    d_key_col_map_ =
+      std::make_unique<rmm::device_scalar<col_map_type>>(*key_to_col_idx_map_, stream);
   }
   /**
    * @brief Gets the pointer to the column hash map in the device memory.
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index 2624ea68629..3f59bc13dda 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -255,7 +255,7 @@ probe_join_hash_table(cudf::table_device_view build_table,
 
     constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE};
     detail::grid_1d config(probe_table.num_rows(), block_size);
-    write_index.set_value_zero(stream);
+    write_index.set_value_to_zero_async(stream);
 
     row_hash hash_probe{probe_table};
     row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL};
diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh
index e6df2b58b15..8fefda9f841 100644
--- a/cpp/src/join/hash_join.cuh
+++ b/cpp/src/join/hash_join.cuh
@@ -122,7 +122,7 @@ std::size_t estimate_join_output_size(table_device_view build_table,
   do {
     sample_probe_num_rows = std::min(sample_probe_num_rows, probe_table_num_rows);
 
-    size_estimate.set_value_zero(stream);
+    size_estimate.set_value_to_zero_async(stream);
 
     row_hash hash_probe{probe_table};
     row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL};
diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh
index 09d812e5d94..34d8912bc41 100644
--- a/cpp/src/reductions/compound.cuh
+++ b/cpp/src/reductions/compound.cuh
@@ -78,7 +78,7 @@ std::unique_ptr<scalar> compound_reduction(column_view const& col,
   }
 
   // set scalar is valid
-  result->set_valid(col.null_count() < col.size(), stream);
+  result->set_valid_async(col.null_count() < col.size(), stream);
   return result;
 };
 
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 083b0da8cf3..00539b6d7a5 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -113,7 +113,7 @@ std::unique_ptr<scalar> reduce(
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
 {
   std::unique_ptr<scalar> result = make_default_constructed_scalar(output_dtype, stream, mr);
-  result->set_valid(false, stream);
+  result->set_valid_async(false, stream);
 
   // check if input column is empty
   if (col.size() <= col.null_count()) return result;
diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh
index baaedda7d63..61002481ddc 100644
--- a/cpp/src/reductions/simple.cuh
+++ b/cpp/src/reductions/simple.cuh
@@ -67,7 +67,7 @@ std::unique_ptr<scalar> simple_reduction(column_view const& col,
   }();
 
   // set scalar is valid
-  result->set_valid((col.null_count() < col.size()), stream);
+  result->set_valid_async(col.null_count() < col.size(), stream);
   return result;
 }
 
@@ -147,7 +147,7 @@ std::unique_ptr<scalar> dictionary_reduction(column_view const& col,
   }();
 
   // set scalar is valid
-  result->set_valid((col.null_count() < col.size()), stream);
+  result->set_valid_async(col.null_count() < col.size(), stream);
   return result;
 }
 
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 9189634b5d8..653164161e8 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -46,9 +46,9 @@ scalar::scalar(scalar const& other,
 
 data_type scalar::type() const noexcept { return _type; }
 
-void scalar::set_valid(bool is_valid, rmm::cuda_stream_view stream)
+void scalar::set_valid_async(bool is_valid, rmm::cuda_stream_view stream)
 {
-  _is_valid.set_value(is_valid, stream);
+  _is_valid.set_value_async(is_valid, stream);
 }
 
 bool scalar::is_valid(rmm::cuda_stream_view stream) const { return _is_valid.value(stream); }
@@ -57,8 +57,6 @@ bool* scalar::validity_data() { return _is_valid.data(); }
 
 bool const* scalar::validity_data() const { return _is_valid.data(); }
 
-string_scalar::string_scalar() : scalar(data_type(type_id::STRING)) {}
-
 string_scalar::string_scalar(std::string const& string,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
@@ -112,9 +110,6 @@ std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
   return result;
 }
 
-template <typename T>
-fixed_point_scalar<T>::fixed_point_scalar() : scalar(data_type(type_to_id<T>())){};
-
 template <typename T>
 fixed_point_scalar<T>::fixed_point_scalar(rep_type value,
                                           numeric::scale_type scale,
@@ -122,7 +117,7 @@ fixed_point_scalar<T>::fixed_point_scalar(rep_type value,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
   : scalar{data_type{type_to_id<T>(), static_cast<int32_t>(scale)}, is_valid, stream, mr},
-    _data{value}
+    _data{value, stream, mr}
 {
 }
 
@@ -131,7 +126,7 @@ fixed_point_scalar<T>::fixed_point_scalar(rep_type value,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
-  : scalar{data_type{type_to_id<T>(), 0}, is_valid, stream, mr}, _data{value}
+  : scalar{data_type{type_to_id<T>(), 0}, is_valid, stream, mr}, _data{value, stream, mr}
 {
 }
 
@@ -140,7 +135,8 @@ fixed_point_scalar<T>::fixed_point_scalar(T value,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
-  : scalar{data_type{type_to_id<T>(), value.scale()}, is_valid, stream, mr}, _data{value.value()}
+  : scalar{data_type{type_to_id<T>(), value.scale()}, is_valid, stream, mr},
+    _data{value.value(), stream, mr}
 {
 }
 
@@ -202,11 +198,6 @@ template class fixed_point_scalar<numeric::decimal64>;
 
 namespace detail {
 
-template <typename T>
-fixed_width_scalar<T>::fixed_width_scalar() : scalar(data_type(type_to_id<T>()))
-{
-}
-
 template <typename T>
 fixed_width_scalar<T>::fixed_width_scalar(T value,
                                           bool is_valid,
@@ -237,8 +228,8 @@ fixed_width_scalar<T>::fixed_width_scalar(fixed_width_scalar<T> const& other,
 template <typename T>
 void fixed_width_scalar<T>::set_value(T value, rmm::cuda_stream_view stream)
 {
-  _data.set_value(value, stream);
-  this->set_valid(true, stream);
+  _data.set_value_async(value, stream);
+  this->set_valid_async(true, stream);
 }
 
 template <typename T>
@@ -491,8 +482,6 @@ TS_CTOR(timestamp_ns, duration_us)
 TS_CTOR(timestamp_ns, duration_ns)
 TS_CTOR(timestamp_ns, int64_t)
 
-list_scalar::list_scalar() : scalar(data_type(type_id::LIST)) {}
-
 list_scalar::list_scalar(cudf::column_view const& data,
                          bool is_valid,
                          rmm::cuda_stream_view stream,
@@ -518,8 +507,6 @@ list_scalar::list_scalar(list_scalar const& other,
 
 column_view list_scalar::view() const { return _data.view(); }
 
-struct_scalar::struct_scalar() : scalar(data_type(type_id::STRUCT)) {}
-
 struct_scalar::struct_scalar(table_view const& data,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu
index bed86544ec7..997265ecfed 100644
--- a/cpp/src/strings/attributes.cu
+++ b/cpp/src/strings/attributes.cu
@@ -150,8 +150,8 @@ std::unique_ptr<column> code_points(
       return length;
     },
     thrust::plus<size_type>());
-  size_type const zero = 0;
-  offsets.set_element_async(0, zero, stream);
+
+  offsets.set_element_to_zero_async(0, stream);
 
   // the total size is the number of characters in the entire column
   size_type num_characters = offsets.back_element(stream);
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index ffe95118f19..97a4ebf9be4 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -73,11 +73,9 @@ std::unique_ptr<column> join_strings(strings_column_view const& strings,
       return bytes;
     },
     thrust::plus<size_type>());
-  size_type const zero = 0;
-  output_offsets.set_element_async(0, zero, stream);
+
+  output_offsets.set_element_to_zero_async(0, stream);
   // total size is the last entry
-  // Note this call does a synchronize on the stream and thereby also protects the
-  // set_element_async parameter from going out of scope before it is used.
   size_type const bytes = output_offsets.back_element(stream);
 
   // build offsets column (only 1 string so 2 offset entries)
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 2c7d6ebb483..7d06d773519 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -90,8 +90,7 @@ auto create_strings_device_views(host_span<column_view const> views, rmm::cuda_s
   // Compute the partition offsets and size of chars column
   // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type
   auto d_partition_offsets = rmm::device_uvector<size_t>(views.size() + 1, stream);
-  size_t zero{0};
-  d_partition_offsets.set_element_async(0, zero, stream);  // zero first element
+  d_partition_offsets.set_element_to_zero_async(0, stream);  // zero first element
 
   thrust::transform_inclusive_scan(rmm::exec_policy(stream),
                                    device_views_ptr,
@@ -246,7 +245,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   }
 
   {  // Copy offsets columns with single kernel launch
-    rmm::device_scalar<size_type> d_valid_count(0);
+    rmm::device_scalar<size_type> d_valid_count(0, stream);
 
     constexpr size_type block_size{256};
     cudf::detail::grid_1d config(offsets_count, block_size);
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 20d4aa2a307..8de9915a668 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -988,7 +988,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   return make_strings_column(col.size(),
                              std::move(offsets),
                              std::move(chars),
-                             col.size() - d_valid_count.value(),
+                             col.size() - d_valid_count.value(stream),
                              std::move(validity),
                              stream,
                              mr);
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index bdcaac4d4ea..36136ef89fa 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -165,8 +165,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
                                    d_token_offsets + 1,
                                    strings_tokenizer{d_strings, d_delimiter},
                                    thrust::plus<int32_t>());
-  int32_t const zero = 0;
-  token_offsets.set_element_async(0, zero, stream);
+  token_offsets.set_element_to_zero_async(0, stream);
   auto const total_tokens = token_offsets.back_element(stream);  // Ex. 5 tokens
 
   // get the token positions (in bytes) per string
@@ -193,7 +192,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
       return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
     },
     thrust::plus<int32_t>());
-  ngram_offsets.set_element_async(0, zero, stream);
+  ngram_offsets.set_element_to_zero_async(0, stream);
   auto const total_ngrams = ngram_offsets.back_element(stream);
 
   // Compute the total size of the ngrams for each string (not for each ngram)
@@ -213,7 +212,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
     d_chars_offsets + 1,
     ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions},
     thrust::plus<int32_t>());
-  chars_offsets.set_element_async(0, zero, stream);
+  chars_offsets.set_element_to_zero_async(0, stream);
   auto const output_chars_size = chars_offsets.back_element(stream);  // Ex. 14 output bytes total
 
   rmm::device_uvector<int32_t> ngram_sizes(total_ngrams, stream);  // size in bytes of each
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index d66b45618aa..0a8b3f5bb48 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -77,8 +77,7 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
                          d_token_counts.template begin<int32_t>(),
                          d_token_counts.template end<int32_t>(),
                          token_offsets.begin() + 1);
-  int32_t const zero = 0;
-  token_offsets.set_element_async(0, zero, stream);
+  token_offsets.set_element_to_zero_async(0, stream);
   auto const total_tokens = token_offsets.back_element(stream);
   // build a list of pointers to each token
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 019e72d3d3f..3f4cb073eb5 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -1101,7 +1101,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_ScalarInvalid_B8_SI
   auto int_col    = fixed_width_column_wrapper<TypeLhs>{{-INT32_MAX, -37, 0, 499, 44, INT32_MAX},
                                                      {false, true, false, true, true, false}};
   auto int_scalar = cudf::scalar_type_t<TypeRhs>(999);
-  int_scalar.set_valid(false);
+  int_scalar.set_valid_async(false);
 
   auto op_col = cudf::binary_operation(
     int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
@@ -1217,7 +1217,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st
                                                     {true, true, true, true, true, true, true});
   // Matching a scalar that is invalid
   cudf::string_scalar str_scalar("foo");
-  str_scalar.set_valid(false);
+  str_scalar.set_valid_async(false);
 
   auto op_col = cudf::binary_operation(
     str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
@@ -1262,7 +1262,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st
                                        {false, false, false, false, false, false, false});
   // Matching a scalar that is invalid
   cudf::string_scalar str_scalar("foo");
-  str_scalar.set_valid(false);
+  str_scalar.set_valid_async(false);
 
   auto op_col = cudf::binary_operation(
     str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
@@ -1303,7 +1303,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_InvalidScalar_B8_st
                                                     {true, false, true, true, true, false, true});
   // Valid string invalidated
   cudf::string_scalar str_scalar("bb");
-  str_scalar.set_valid(false);
+  str_scalar.set_valid_async(false);
 
   auto op_col = cudf::binary_operation(
     str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
@@ -1553,7 +1553,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_FP32
     fixed_width_column_wrapper<TypeLhs>{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55},
                                         {false, true, false, true, false, true, false}};
   auto float_scalar = cudf::scalar_type_t<TypeRhs>(-3.14f);
-  float_scalar.set_valid(false);
+  float_scalar.set_valid_async(false);
 
   auto op_col = cudf::binary_operation(
     int_col, float_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
@@ -1575,7 +1575,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_SI8_SI8_FP32)
   auto int_col = fixed_width_column_wrapper<TypeLhs>{
     {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}};
   auto float_scalar = cudf::scalar_type_t<TypeRhs>(-3.14f);
-  float_scalar.set_valid(false);
+  float_scalar.set_valid_async(false);
 
   auto op_col = cudf::binary_operation(
     float_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
@@ -1728,7 +1728,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_string_string_
     {"eee", "invalid", "<null>", "", "", "", "ééé", "foo", "bar", "abc", "foo"},
     {false, true, true, false, true, true, true, false, false, true, true});
   cudf::string_scalar str_scalar("foo");
-  str_scalar.set_valid(false);
+  str_scalar.set_valid_async(false);
 
   // Returns the lhs_col
   auto op_col = cudf::binary_operation(
diff --git a/cpp/tests/binaryop/binop-null-test.cpp b/cpp/tests/binaryop/binop-null-test.cpp
index 184bafc8c57..c91bc12d95f 100644
--- a/cpp/tests/binaryop/binop-null-test.cpp
+++ b/cpp/tests/binaryop/binop-null-test.cpp
@@ -63,7 +63,7 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_Valid)
   using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
 
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  lhs.set_valid(false);
+  lhs.set_valid_async(false);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::ALL_VALID);
 
   auto out =
@@ -98,7 +98,7 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_NonNullable)
   using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
 
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  lhs.set_valid(false);
+  lhs.set_valid_async(false);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
   auto out =
diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp
index 500c017ca85..be4a689f213 100644
--- a/cpp/tests/copying/scatter_tests.cpp
+++ b/cpp/tests/copying/scatter_tests.cpp
@@ -738,7 +738,7 @@ struct BooleanMaskScalarScatter : public cudf::test::BaseFixture {
     }
 
     static_cast<ScalarType*>(scalar.get())->set_value(value);
-    static_cast<ScalarType*>(scalar.get())->set_valid(validity);
+    static_cast<ScalarType*>(scalar.get())->set_valid_async(validity);
 
     return scalar;
   }
@@ -774,7 +774,7 @@ TYPED_TEST(BooleanMaskScalarScatter, WithNull)
   bool validity = false;
   auto scalar_1 = this->form_scalar(source, validity);
   auto scalar_2 = cudf::make_string_scalar("cudf");
-  scalar_2->set_valid(true);
+  scalar_2->set_valid_async(true);
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
   scalar_vect.push_back(*scalar_1);
   scalar_vect.push_back(*scalar_2);
@@ -804,7 +804,7 @@ class BooleanMaskScatterScalarString : public cudf::test::BaseFixture {
 TEST_F(BooleanMaskScatterScalarString, NoNUll)
 {
   auto scalar = cudf::make_string_scalar("cudf");
-  scalar->set_valid(true);
+  scalar->set_valid_async(true);
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
   scalar_vect.push_back(*scalar);
 
@@ -823,7 +823,7 @@ TEST_F(BooleanMaskScatterScalarString, NoNUll)
 TEST_F(BooleanMaskScatterScalarString, WithNUll)
 {
   auto scalar = cudf::make_string_scalar("cudf");
-  scalar->set_valid(true);
+  scalar->set_valid_async(true);
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
   scalar_vect.push_back(*scalar);
   cudf::test::strings_column_wrapper target({"is", "is", "a", "udf", "api"}, {1, 0, 0, 1, 1});
diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp
index 6fedf7fea05..75c0cad20e7 100644
--- a/cpp/tests/filling/fill_tests.cpp
+++ b/cpp/tests/filling/fill_tests.cpp
@@ -68,7 +68,7 @@ class FillTypedTestFixture : public cudf::test::BaseFixture {
     }
     using ScalarType = cudf::scalar_type_t<T>;
     static_cast<ScalarType*>(p_val.get())->set_value(value);
-    static_cast<ScalarType*>(p_val.get())->set_valid(value_is_valid);
+    static_cast<ScalarType*>(p_val.get())->set_valid_async(value_is_valid);
 
     auto expected_elements =
       cudf::detail::make_counting_transform_iterator(0, [begin, end, value](auto i) {
@@ -189,7 +189,7 @@ class FillStringTestFixture : public cudf::test::BaseFixture {
 
     auto p_val       = cudf::make_string_scalar(value);
     using ScalarType = cudf::scalar_type_t<cudf::string_view>;
-    static_cast<ScalarType*>(p_val.get())->set_valid(value_is_valid);
+    static_cast<ScalarType*>(p_val.get())->set_valid_async(value_is_valid);
 
     auto p_chars   = value.c_str();
     auto num_chars = value.length();
@@ -285,7 +285,7 @@ TEST_F(FillErrorTestFixture, InvalidInplaceCall)
   using T_int      = cudf::id_to_type<cudf::type_id::INT32>;
   using ScalarType = cudf::scalar_type_t<T_int>;
   static_cast<ScalarType*>(p_val_int.get())->set_value(5);
-  static_cast<ScalarType*>(p_val_int.get())->set_valid(false);
+  static_cast<ScalarType*>(p_val_int.get())->set_valid_async(false);
 
   auto destination = cudf::test::fixed_width_column_wrapper<int32_t>(
     thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + 100);
diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
index 73194271a32..fe0a3cb6084 100644
--- a/cpp/tests/lists/contains_tests.cpp
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -46,7 +46,7 @@ template <typename T, std::enable_if_t<cudf::is_numeric<T>(), void>* = nullptr>
 auto create_scalar_search_key(T const& value)
 {
   auto search_key = make_numeric_scalar(data_type{type_to_id<T>()});
-  search_key->set_valid(true);
+  search_key->set_valid_async(true);
   static_cast<scalar_type_t<T>*>(search_key.get())->set_value(value);
   return search_key;
 }
@@ -61,7 +61,7 @@ template <typename T, std::enable_if_t<cudf::is_timestamp<T>(), void>* = nullptr
 auto create_scalar_search_key(typename T::rep const& value)
 {
   auto search_key = make_timestamp_scalar(data_type{type_to_id<T>()});
-  search_key->set_valid(true);
+  search_key->set_valid_async(true);
   static_cast<scalar_type_t<typename T::rep>*>(search_key.get())->set_value(value);
   return search_key;
 }
@@ -70,7 +70,7 @@ template <typename T, std::enable_if_t<cudf::is_duration<T>(), void>* = nullptr>
 auto create_scalar_search_key(typename T::rep const& value)
 {
   auto search_key = make_duration_scalar(data_type{type_to_id<T>()});
-  search_key->set_valid(true);
+  search_key->set_valid_async(true);
   static_cast<scalar_type_t<typename T::rep>*>(search_key.get())->set_value(value);
   return search_key;
 }
@@ -79,7 +79,7 @@ template <typename T, std::enable_if_t<cudf::is_numeric<T>(), void>* = nullptr>
 auto create_null_search_key()
 {
   auto search_key = make_numeric_scalar(data_type{type_to_id<T>()});
-  search_key->set_valid(false);
+  search_key->set_valid_async(false);
   return search_key;
 }
 
@@ -87,7 +87,7 @@ template <typename T, std::enable_if_t<cudf::is_timestamp<T>(), void>* = nullptr
 auto create_null_search_key()
 {
   auto search_key = make_timestamp_scalar(data_type{type_to_id<T>()});
-  search_key->set_valid(false);
+  search_key->set_valid_async(false);
   return search_key;
 }
 
@@ -95,7 +95,7 @@ template <typename T, std::enable_if_t<cudf::is_duration<T>(), void>* = nullptr>
 auto create_null_search_key()
 {
   auto search_key = make_duration_scalar(data_type{type_to_id<T>()});
-  search_key->set_valid(false);
+  search_key->set_valid_async(false);
   return search_key;
 }
 
diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp
index 499745c7dc4..ecc36b3af20 100644
--- a/cpp/tests/replace/clamp_test.cpp
+++ b/cpp/tests/replace/clamp_test.cpp
@@ -33,9 +33,9 @@ struct ClampErrorTest : public cudf::test::BaseFixture {
 TEST_F(ClampErrorTest, MisMatchingScalarTypes)
 {
   auto lo = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo->set_valid(true);
+  lo->set_valid_async(true);
   auto hi = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT64));
-  hi->set_valid(true);
+  hi->set_valid_async(true);
 
   cudf::test::fixed_width_column_wrapper<int32_t> input({1, 2, 3, 4, 5, 6});
 
@@ -45,9 +45,9 @@ TEST_F(ClampErrorTest, MisMatchingScalarTypes)
 TEST_F(ClampErrorTest, MisMatchingInputAndScalarTypes)
 {
   auto lo = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo->set_valid(true);
+  lo->set_valid_async(true);
   auto hi = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi->set_valid(true);
+  hi->set_valid_async(true);
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
@@ -57,13 +57,13 @@ TEST_F(ClampErrorTest, MisMatchingInputAndScalarTypes)
 TEST_F(ClampErrorTest, MisMatchingReplaceScalarTypes)
 {
   auto lo = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo->set_valid(true);
+  lo->set_valid_async(true);
   auto hi = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi->set_valid(true);
+  hi->set_valid_async(true);
   auto lo_replace = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT64));
-  lo_replace->set_valid(true);
+  lo_replace->set_valid_async(true);
   auto hi_replace = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi_replace->set_valid(true);
+  hi_replace->set_valid_async(true);
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
@@ -73,13 +73,13 @@ TEST_F(ClampErrorTest, MisMatchingReplaceScalarTypes)
 TEST_F(ClampErrorTest, InValidCase1)
 {
   auto lo = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo->set_valid(true);
+  lo->set_valid_async(true);
   auto hi = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi->set_valid(true);
+  hi->set_valid_async(true);
   auto lo_replace = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo_replace->set_valid(false);
+  lo_replace->set_valid_async(false);
   auto hi_replace = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi_replace->set_valid(true);
+  hi_replace->set_valid_async(true);
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
@@ -89,13 +89,13 @@ TEST_F(ClampErrorTest, InValidCase1)
 TEST_F(ClampErrorTest, InValidCase2)
 {
   auto lo = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo->set_valid(true);
+  lo->set_valid_async(true);
   auto hi = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi->set_valid(true);
+  hi->set_valid_async(true);
   auto lo_replace = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo_replace->set_valid(true);
+  lo_replace->set_valid_async(true);
   auto hi_replace = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi_replace->set_valid(false);
+  hi_replace->set_valid_async(false);
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
@@ -108,9 +108,9 @@ struct ClampEmptyCaseTest : public cudf::test::BaseFixture {
 TEST_F(ClampEmptyCaseTest, BothScalarEmptyInvalid)
 {
   auto lo = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo->set_valid(false);
+  lo->set_valid_async(false);
   auto hi = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi->set_valid(false);
+  hi->set_valid_async(false);
 
   cudf::test::fixed_width_column_wrapper<int32_t> input({1, 2, 3, 4, 5, 6});
 
@@ -122,9 +122,9 @@ TEST_F(ClampEmptyCaseTest, BothScalarEmptyInvalid)
 TEST_F(ClampEmptyCaseTest, EmptyInput)
 {
   auto lo = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  lo->set_valid(true);
+  lo->set_valid_async(true);
   auto hi = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-  hi->set_valid(true);
+  hi->set_valid_async(true);
 
   cudf::test::fixed_width_column_wrapper<int32_t> input({});
 
@@ -181,13 +181,13 @@ struct ClampTestNumeric : public cudf::test::BaseFixture {
     }
 
     static_cast<ScalarType*>(lo_scalar.get())->set_value(lo);
-    static_cast<ScalarType*>(lo_scalar.get())->set_valid(lo_validity);
+    static_cast<ScalarType*>(lo_scalar.get())->set_valid_async(lo_validity);
     static_cast<ScalarType*>(lo_replace_scalar.get())->set_value(lo_replace);
-    static_cast<ScalarType*>(lo_replace_scalar.get())->set_valid(lo_replace_validity);
+    static_cast<ScalarType*>(lo_replace_scalar.get())->set_valid_async(lo_replace_validity);
     static_cast<ScalarType*>(hi_scalar.get())->set_value(hi);
-    static_cast<ScalarType*>(hi_scalar.get())->set_valid(hi_validity);
+    static_cast<ScalarType*>(hi_scalar.get())->set_valid_async(hi_validity);
     static_cast<ScalarType*>(hi_replace_scalar.get())->set_value(hi_replace);
-    static_cast<ScalarType*>(hi_replace_scalar.get())->set_valid(hi_replace_validity);
+    static_cast<ScalarType*>(hi_replace_scalar.get())->set_valid_async(hi_replace_validity);
 
     if (input.size() == input_validity.size()) {
       cudf::test::fixed_width_column_wrapper<T> input_column(
@@ -307,9 +307,9 @@ TYPED_TEST(ClampFloatTest, WithNANandNoNull)
     cudf::make_numeric_scalar(cudf::data_type(cudf::data_type{cudf::type_to_id<T>()}));
 
   static_cast<ScalarType*>(lo_scalar.get())->set_value(2.0);
-  static_cast<ScalarType*>(lo_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(lo_scalar.get())->set_valid_async(true);
   static_cast<ScalarType*>(hi_scalar.get())->set_value(6.0);
-  static_cast<ScalarType*>(hi_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(hi_scalar.get())->set_valid_async(true);
 
   auto got = cudf::clamp(input, *lo_scalar, *hi_scalar);
   cudf::test::fixed_width_column_wrapper<T> expected(
@@ -332,9 +332,9 @@ TYPED_TEST(ClampFloatTest, WithNANandNull)
     cudf::make_numeric_scalar(cudf::data_type(cudf::data_type{cudf::type_to_id<T>()}));
 
   static_cast<ScalarType*>(lo_scalar.get())->set_value(2.0);
-  static_cast<ScalarType*>(lo_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(lo_scalar.get())->set_valid_async(true);
   static_cast<ScalarType*>(hi_scalar.get())->set_value(6.0);
-  static_cast<ScalarType*>(hi_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(hi_scalar.get())->set_valid_async(true);
 
   auto got = cudf::clamp(input, *lo_scalar, *hi_scalar);
   cudf::test::fixed_width_column_wrapper<T> expected(
@@ -362,13 +362,13 @@ TYPED_TEST(ClampFloatTest, SignOfAFloat)
     cudf::make_numeric_scalar(cudf::data_type(cudf::data_type{cudf::type_to_id<T>()}));
 
   static_cast<ScalarType*>(lo_scalar.get())->set_value(0.0);
-  static_cast<ScalarType*>(lo_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(lo_scalar.get())->set_valid_async(true);
   static_cast<ScalarType*>(hi_scalar.get())->set_value(0.0);
-  static_cast<ScalarType*>(hi_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(hi_scalar.get())->set_valid_async(true);
   static_cast<ScalarType*>(lo_replace_scalar.get())->set_value(-1.0);
-  static_cast<ScalarType*>(lo_replace_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(lo_replace_scalar.get())->set_valid_async(true);
   static_cast<ScalarType*>(hi_replace_scalar.get())->set_value(1.0);
-  static_cast<ScalarType*>(hi_replace_scalar.get())->set_valid(true);
+  static_cast<ScalarType*>(hi_replace_scalar.get())->set_valid_async(true);
 
   auto got = cudf::clamp(input, *lo_scalar, *lo_replace_scalar, *hi_scalar, *hi_replace_scalar);
   cudf::test::fixed_width_column_wrapper<T> expected(
@@ -390,8 +390,8 @@ TEST_F(ClampStringTest, WithNullableColumn)
 
   auto lo = cudf::make_string_scalar("B");
   auto hi = cudf::make_string_scalar("e");
-  lo->set_valid(true);
-  hi->set_valid(true);
+  lo->set_valid_async(true);
+  hi->set_valid_async(true);
 
   std::vector<std::string> expected_strings{"B", "b", "c", "D", "e", "F", "G", "H", "i", "e", "B"};
 
@@ -411,8 +411,8 @@ TEST_F(ClampStringTest, WithNonNullableColumn)
 
   auto lo = cudf::make_string_scalar("B");
   auto hi = cudf::make_string_scalar("e");
-  lo->set_valid(true);
-  hi->set_valid(true);
+  lo->set_valid_async(true);
+  hi->set_valid_async(true);
 
   std::vector<std::string> expected_strings{"B", "b", "c", "D", "e", "F", "G", "H", "e", "e", "B"};
 
@@ -432,8 +432,8 @@ TEST_F(ClampStringTest, WithNullableColumnNullLow)
 
   auto lo = cudf::make_string_scalar("B");
   auto hi = cudf::make_string_scalar("e");
-  lo->set_valid(false);
-  hi->set_valid(true);
+  lo->set_valid_async(false);
+  hi->set_valid_async(true);
 
   std::vector<std::string> expected_strings{"A", "b", "c", "D", "e", "F", "G", "H", "i", "e", "B"};
 
@@ -454,8 +454,8 @@ TEST_F(ClampStringTest, WithNullableColumnNullHigh)
 
   auto lo = cudf::make_string_scalar("B");
   auto hi = cudf::make_string_scalar("e");
-  lo->set_valid(true);
-  hi->set_valid(false);
+  lo->set_valid_async(true);
+  hi->set_valid_async(false);
 
   std::vector<std::string> expected_strings{"B", "b", "c", "D", "e", "F", "G", "H", "i", "j", "B"};
 
@@ -476,8 +476,8 @@ TEST_F(ClampStringTest, WithNullableColumnBothLoAndHiNull)
 
   auto lo = cudf::make_string_scalar("B");
   auto hi = cudf::make_string_scalar("e");
-  lo->set_valid(false);
-  hi->set_valid(false);
+  lo->set_valid_async(false);
+  hi->set_valid_async(false);
 
   auto got = cudf::clamp(input, *lo, *hi);
 
@@ -495,10 +495,10 @@ TEST_F(ClampStringTest, WithReplaceString)
   auto lo_replace = cudf::make_string_scalar("Z");
   auto hi         = cudf::make_string_scalar("e");
   auto hi_replace = cudf::make_string_scalar("z");
-  lo->set_valid(true);
-  lo_replace->set_valid(true);
-  hi->set_valid(true);
-  hi_replace->set_valid(true);
+  lo->set_valid_async(true);
+  lo_replace->set_valid_async(true);
+  hi->set_valid_async(true);
+  hi_replace->set_valid_async(true);
 
   std::vector<std::string> expected_strings{"Z", "b", "c", "D", "e", "F", "G", "H", "z", "z", "B"};
 
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index 3f0f546f55e..92342326dca 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -171,7 +171,7 @@ TEST_F(ReplaceNullsStringsTest, SimpleReplaceScalar)
   std::vector<cudf::valid_type> input_v{0, 0, 0, 0, 0, 0, 0, 0};
   std::unique_ptr<cudf::scalar> repl =
     cudf::make_string_scalar("rep", rmm::cuda_stream_default, mr());
-  repl->set_valid(true, rmm::cuda_stream_default);
+  repl->set_valid_async(true, rmm::cuda_stream_default);
   std::vector<std::string> expected{"rep", "rep", "rep", "rep", "rep", "rep", "rep", "rep"};
 
   cudf::test::strings_column_wrapper input_w{input.begin(), input.end(), input_v.begin()};
diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu
index d0b6b0db44a..19d5372d93a 100644
--- a/cpp/tests/scalar/scalar_device_view_test.cu
+++ b/cpp/tests/scalar/scalar_device_view_test.cu
@@ -27,6 +27,7 @@
 
 #include <thrust/sequence.h>
 #include <random>
+#include "rmm/cuda_stream_view.hpp"
 
 template <typename T>
 struct TypedScalarDeviceViewTest : public cudf::test::BaseFixture {
@@ -49,13 +50,14 @@ __global__ void test_value(ScalarDeviceViewType s, ScalarDeviceViewType s1, bool
 
 TYPED_TEST(TypedScalarDeviceViewTest, Value)
 {
-  TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(7);
+  TypeParam value  = cudf::test::make_type_param_scalar<TypeParam>(7);
+  TypeParam value1 = cudf::test::make_type_param_scalar<TypeParam>(11);
   cudf::scalar_type_t<TypeParam> s(value);
-  cudf::scalar_type_t<TypeParam> s1;
+  cudf::scalar_type_t<TypeParam> s1{value1};
 
   auto scalar_device_view  = cudf::get_scalar_device_view(s);
   auto scalar_device_view1 = cudf::get_scalar_device_view(s1);
-  rmm::device_scalar<bool> result;
+  rmm::device_scalar<bool> result{rmm::cuda_stream_default};
 
   test_set_value<<<1, 1>>>(scalar_device_view, scalar_device_view1);
   CHECK_CUDA(0);
@@ -66,7 +68,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)
   test_value<<<1, 1>>>(scalar_device_view, scalar_device_view1, result.data());
   CHECK_CUDA(0);
 
-  EXPECT_TRUE(result.value());
+  EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
 
 template <typename ScalarDeviceViewType>
@@ -80,12 +82,12 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
   TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(5);
   cudf::scalar_type_t<TypeParam> s(value, false);
   auto scalar_device_view = cudf::get_scalar_device_view(s);
-  rmm::device_scalar<bool> result;
+  rmm::device_scalar<bool> result{rmm::cuda_stream_default};
 
   test_null<<<1, 1>>>(scalar_device_view, result.data());
   CHECK_CUDA(0);
 
-  EXPECT_FALSE(result.value());
+  EXPECT_FALSE(result.value(rmm::cuda_stream_default));
 }
 
 template <typename ScalarDeviceViewType>
@@ -96,9 +98,10 @@ __global__ void test_setnull(ScalarDeviceViewType s)
 
 TYPED_TEST(TypedScalarDeviceViewTest, SetNull)
 {
-  cudf::scalar_type_t<TypeParam> s;
+  TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(5);
+  cudf::scalar_type_t<TypeParam> s{value};
   auto scalar_device_view = cudf::get_scalar_device_view(s);
-  s.set_valid(true);
+  s.set_valid_async(true);
   EXPECT_TRUE(s.is_valid());
 
   test_setnull<<<1, 1>>>(scalar_device_view);
@@ -124,11 +127,11 @@ TEST_F(StringScalarDeviceViewTest, Value)
   cudf::string_scalar s(value);
 
   auto scalar_device_view = cudf::get_scalar_device_view(s);
-  rmm::device_scalar<bool> result;
+  rmm::device_scalar<bool> result{rmm::cuda_stream_default};
   auto value_v = cudf::detail::make_device_uvector_sync(value);
 
   test_string_value<<<1, 1>>>(scalar_device_view, value_v.data(), value.size(), result.data());
   CHECK_CUDA(0);
 
-  EXPECT_TRUE(result.value());
+  EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp
index 7a12c2fd27d..2047d815867 100644
--- a/cpp/tests/scalar/scalar_test.cpp
+++ b/cpp/tests/scalar/scalar_test.cpp
@@ -26,6 +26,7 @@
 
 #include <thrust/sequence.h>
 #include <random>
+#include <rmm/cuda_stream_view.hpp>
 
 template <typename T>
 struct TypedScalarTest : public cudf::test::BaseFixture {
@@ -58,8 +59,9 @@ TYPED_TEST(TypedScalarTest, ConstructNull)
 
 TYPED_TEST(TypedScalarTestWithoutFixedPoint, SetValue)
 {
+  TypeParam init  = cudf::test::make_type_param_scalar<TypeParam>(0);
   TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(9);
-  cudf::scalar_type_t<TypeParam> s;
+  cudf::scalar_type_t<TypeParam> s(init, true);
   s.set_value(value);
 
   EXPECT_TRUE(s.is_valid());
@@ -69,9 +71,8 @@ TYPED_TEST(TypedScalarTestWithoutFixedPoint, SetValue)
 TYPED_TEST(TypedScalarTestWithoutFixedPoint, SetNull)
 {
   TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(6);
-  cudf::scalar_type_t<TypeParam> s;
-  s.set_value(value);
-  s.set_valid(false);
+  cudf::scalar_type_t<TypeParam> s(value, true);
+  s.set_valid_async(false);
 
   EXPECT_FALSE(s.is_valid());
 }
@@ -111,13 +112,6 @@ TEST_F(StringScalarTest, DefaultValidity)
   EXPECT_EQ(value, s.to_string());
 }
 
-TEST_F(StringScalarTest, ConstructNull)
-{
-  auto s = cudf::string_scalar();
-
-  EXPECT_FALSE(s.is_valid());
-}
-
 TEST_F(StringScalarTest, CopyConstructor)
 {
   std::string value = "test_string";
@@ -161,13 +155,6 @@ TEST_F(ListScalarTest, DefaultValidityNested)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(data, s.view());
 }
 
-TEST_F(ListScalarTest, ConstructNull)
-{
-  auto s = cudf::list_scalar();
-
-  EXPECT_FALSE(s.is_valid());
-}
-
 TEST_F(ListScalarTest, MoveColumnConstructor)
 {
   auto data = cudf::test::fixed_width_column_wrapper<int32_t>{1, 2, 3};
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index a517fe06c1f..57f5f613da7 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -17,18 +17,18 @@
 #include <arrow/api.h>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/concatenate.hpp>
+#include <cudf/detail/interop.hpp>
 #include <cudf/filling.hpp>
-#include <cudf/interop.hpp>
 #include <cudf/hashing.hpp>
-#include <cudf/reshape.hpp>
-#include <cudf/utilities/bit.hpp>
-#include <cudf/detail/interop.hpp>
+#include <cudf/interop.hpp>
 #include <cudf/lists/combine.hpp>
 #include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/reshape.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/structs/structs_column_view.hpp>
+#include <cudf/utilities/bit.hpp>
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"
@@ -54,13 +54,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(JNIEnv *env, j
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env, jclass,
-                                                                   jint j_type,
-                                                                   jlong j_col_length,
-                                                                   jlong j_null_count,
-                                                                   jobject j_data_obj,
-                                                                   jobject j_validity_obj,
-                                                                   jobject j_offsets_obj) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(
+    JNIEnv *env, jclass, jint j_type, jlong j_col_length, jlong j_null_count, jobject j_data_obj,
+    jobject j_validity_obj, jobject j_offsets_obj) {
   try {
     cudf::jni::auto_set_device(env);
     cudf::type_id n_type = static_cast<cudf::type_id>(j_type);
@@ -83,17 +79,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env,
       offsets_address = env->GetDirectBufferAddress(j_offsets_obj);
       offsets_length = env->GetDirectBufferCapacity(j_offsets_obj);
     }
-    auto data_buffer = arrow::Buffer::Wrap(static_cast<const char *>(data_address), static_cast<int>(data_length));
-    auto null_buffer = arrow::Buffer::Wrap(static_cast<const char *>(validity_address), static_cast<int>(validity_length));
-    auto offsets_buffer = arrow::Buffer::Wrap(static_cast<const char *>(offsets_address), static_cast<int>(offsets_length));
+    auto data_buffer =
+        arrow::Buffer::Wrap(static_cast<const char *>(data_address), static_cast<int>(data_length));
+    auto null_buffer = arrow::Buffer::Wrap(static_cast<const char *>(validity_address),
+                                           static_cast<int>(validity_length));
+    auto offsets_buffer = arrow::Buffer::Wrap(static_cast<const char *>(offsets_address),
+                                              static_cast<int>(offsets_length));
 
     std::shared_ptr<arrow::Array> arrow_array;
     switch (n_type) {
       case cudf::type_id::DECIMAL32:
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL32 yet", 0);
+        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL32 yet",
+                      0);
         break;
       case cudf::type_id::DECIMAL64:
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL64 yet", 0);
+        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL64 yet",
+                      0);
         break;
       case cudf::type_id::STRUCT:
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting STRUCT yet", 0);
@@ -102,19 +103,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env,
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting LIST yet", 0);
         break;
       case cudf::type_id::DICTIONARY32:
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DICTIONARY32 yet", 0);
+        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
+                      "Don't support converting DICTIONARY32 yet", 0);
         break;
       case cudf::type_id::STRING:
-        arrow_array = std::make_shared<arrow::StringArray>(j_col_length, offsets_buffer, data_buffer, null_buffer, j_null_count);
+        arrow_array = std::make_shared<arrow::StringArray>(j_col_length, offsets_buffer,
+                                                           data_buffer, null_buffer, j_null_count);
         break;
       default:
         // this handles the primitive types
-        arrow_array = cudf::detail::to_arrow_array(n_type, j_col_length, data_buffer, null_buffer, j_null_count);
+        arrow_array = cudf::detail::to_arrow_array(n_type, j_col_length, data_buffer, null_buffer,
+                                                   j_null_count);
     }
     auto name_and_type = arrow::field("col", arrow_array->type());
     std::vector<std::shared_ptr<arrow::Field>> fields = {name_and_type};
     std::shared_ptr<arrow::Schema> schema = std::make_shared<arrow::Schema>(fields);
-    auto arrow_table = arrow::Table::Make(schema, std::vector<std::shared_ptr<arrow::Array>>{arrow_array});
+    auto arrow_table =
+        arrow::Table::Make(schema, std::vector<std::shared_ptr<arrow::Array>>{arrow_array});
     std::unique_ptr<cudf::table> table_result = cudf::from_arrow(*(arrow_table));
     std::vector<std::unique_ptr<cudf::column>> retCols = table_result->release();
     if (retCols.size() != 1) {
@@ -125,65 +130,57 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNIEnv *env, jclass,
-                                                                             jlongArray column_handles,
-                                                                             jlong separator,
-                                                                             jlong narep,
-                                                                             jboolean separate_nulls) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(
+    JNIEnv *env, jclass, jlongArray column_handles, jlong separator, jlong narep,
+    jboolean separate_nulls) {
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0);
   JNI_NULL_CHECK(env, narep, "narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto& separator_scalar = *reinterpret_cast<cudf::string_scalar*>(separator);
-    const auto& narep_scalar     = *reinterpret_cast<cudf::string_scalar*>(narep);
-    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES
-                                      : cudf::strings::separator_on_nulls::NO;
+    const auto &separator_scalar = *reinterpret_cast<cudf::string_scalar *>(separator);
+    const auto &narep_scalar = *reinterpret_cast<cudf::string_scalar *>(narep);
+    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES :
+                                        cudf::strings::separator_on_nulls::NO;
 
     cudf::jni::native_jpointerArray<cudf::column_view> n_cudf_columns(env, column_handles);
     std::vector<cudf::column_view> column_views;
-    std::transform(n_cudf_columns.data(),
-                   n_cudf_columns.data() + n_cudf_columns.size(),
+    std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(),
                    std::back_inserter(column_views),
                    [](auto const &p_column) { return *p_column; });
 
-    std::unique_ptr<cudf::column> result =
-      cudf::strings::concatenate(cudf::table_view(column_views), separator_scalar,
-                                 narep_scalar, null_policy);
+    std::unique_ptr<cudf::column> result = cudf::strings::concatenate(
+        cudf::table_view(column_views), separator_scalar, narep_scalar, null_policy);
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(JNIEnv *env, jclass,
-                                                                                   jlongArray column_handles,
-                                                                                   jlong sep_handle,
-                                                                                   jlong separator_narep,
-                                                                                   jlong col_narep,
-                                                                                   jboolean separate_nulls) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(
+    JNIEnv *env, jclass, jlongArray column_handles, jlong sep_handle, jlong separator_narep,
+    jlong col_narep, jboolean separate_nulls) {
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0);
   JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0);
   JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto& separator_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(separator_narep);
-    const auto& col_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(col_narep);
-    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES
-                                      : cudf::strings::separator_on_nulls::NO;
+    const auto &separator_narep_scalar = *reinterpret_cast<cudf::string_scalar *>(separator_narep);
+    const auto &col_narep_scalar = *reinterpret_cast<cudf::string_scalar *>(col_narep);
+    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES :
+                                        cudf::strings::separator_on_nulls::NO;
 
     cudf::jni::native_jpointerArray<cudf::column_view> n_cudf_columns(env, column_handles);
     std::vector<cudf::column_view> column_views;
-    std::transform(n_cudf_columns.data(),
-                   n_cudf_columns.data() + n_cudf_columns.size(),
+    std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(),
                    std::back_inserter(column_views),
                    [](auto const &p_column) { return *p_column; });
 
     cudf::column_view *column = reinterpret_cast<cudf::column_view *>(sep_handle);
     cudf::strings_column_view strings_column(*column);
     std::unique_ptr<cudf::column> result =
-      cudf::strings::concatenate(cudf::table_view(column_views), strings_column,
-                                 separator_narep_scalar, col_narep_scalar, null_policy);
+        cudf::strings::concatenate(cudf::table_view(column_views), strings_column,
+                                   separator_narep_scalar, col_narep_scalar, null_policy);
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
@@ -195,28 +192,25 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatListByRow(JNIEnv
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto null_policy = ignore_null ? cudf::lists::concatenate_null_policy::IGNORE
-                                   : cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW;
+    auto null_policy = ignore_null ? cudf::lists::concatenate_null_policy::IGNORE :
+                                     cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW;
 
     cudf::jni::native_jpointerArray<cudf::column_view> n_cudf_columns(env, column_handles);
     std::vector<cudf::column_view> column_views;
-    std::transform(n_cudf_columns.data(),
-                   n_cudf_columns.data() + n_cudf_columns.size(),
+    std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(),
                    std::back_inserter(column_views),
                    [](auto const &p_column) { return *p_column; });
 
     std::unique_ptr<cudf::column> result =
-      cudf::lists::concatenate_rows(cudf::table_view(column_views), null_policy);
+        cudf::lists::concatenate_rows(cudf::table_view(column_views), null_policy);
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, jobject j_object,
-                                                                  jlongArray handles,
-                                                                  jlong j_type,
-                                                                  jint scale,
-                                                                  jlong row_count) {
+                                                                  jlongArray handles, jlong j_type,
+                                                                  jint scale, jlong row_count) {
   using ScalarType = cudf::scalar_type_t<cudf::size_type>;
   JNI_NULL_CHECK(env, handles, "native view handles are null", 0)
   try {
@@ -228,7 +222,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j
       children_vector[i] = *children[i];
     }
     auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-    zero->set_valid(true);
+    zero->set_valid_async(true);
     static_cast<ScalarType *>(zero.get())->set_value(0);
 
     if (children.size() == 0) {
@@ -236,17 +230,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j
       auto offsets = cudf::make_column_from_scalar(*zero, row_count + 1);
       cudf::data_type n_data_type = cudf::jni::make_data_type(j_type, scale);
       auto empty_col = cudf::make_empty_column(n_data_type);
-      ret = cudf::make_lists_column(row_count, std::move(offsets), std::move(empty_col),
-              0, rmm::device_buffer());
+      ret = cudf::make_lists_column(row_count, std::move(offsets), std::move(empty_col), 0,
+                                    rmm::device_buffer());
     } else {
       auto count = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-      count->set_valid(true);
+      count->set_valid_async(true);
       static_cast<ScalarType *>(count.get())->set_value(children.size());
 
       std::unique_ptr<cudf::column> offsets = cudf::sequence(row_count + 1, *zero, *count);
       auto data_col = cudf::interleave_columns(cudf::table_view(children_vector));
-      ret = cudf::make_lists_column(row_count, std::move(offsets), std::move(data_col),
-              0, rmm::device_buffer());
+      ret = cudf::make_lists_column(row_count, std::move(offsets), std::move(data_col), 0,
+                                    rmm::device_buffer());
     }
 
     return reinterpret_cast<jlong>(ret.release());
@@ -288,7 +282,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env,
   CATCH_STD(env, 0);
 }
 
-
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv *env, jclass clazz,
                                                                      jlongArray column_handles) {
   JNI_NULL_CHECK(env, column_handles, "input columns are null", 0);
@@ -313,12 +306,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv *env
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_hash(JNIEnv *env,
-                                                              jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_hash(JNIEnv *env, jobject j_object,
                                                               jlongArray column_handles,
                                                               jint hash_function_id,
-                                                              jintArray initial_values,
-                                                              jint seed) {
+                                                              jintArray initial_values, jint seed) {
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   JNI_NULL_CHECK(env, initial_values, "array of initial values is null", 0);
 
@@ -330,13 +321,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_hash(JNIEnv *env,
                    [](auto const &p_column) { return *p_column; });
     cudf::table_view *input_table = new cudf::table_view(column_views);
 
-    cudf::jni::native_jintArray native_iv (env, initial_values);
+    cudf::jni::native_jintArray native_iv(env, initial_values);
     std::vector<uint32_t> vector_iv;
     std::transform(native_iv.data(), native_iv.data() + native_iv.size(),
-                   std::back_inserter(vector_iv),
-                   [](auto const &iv) { return iv; });
+                   std::back_inserter(vector_iv), [](auto const &iv) { return iv; });
 
-    std::unique_ptr<cudf::column> result = cudf::hash(*input_table, static_cast<cudf::hash_id>(hash_function_id), vector_iv, seed);
+    std::unique_ptr<cudf::column> result =
+        cudf::hash(*input_table, static_cast<cudf::hash_id>(hash_function_id), vector_iv, seed);
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
@@ -386,8 +377,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_getNativeColumnView(JNI
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeEmptyCudfColumn(JNIEnv *env,
-                                                                             jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeEmptyCudfColumn(JNIEnv *env, jclass,
                                                                              jint j_type,
                                                                              jint scale) {
 
diff --git a/java/src/main/native/src/ScalarJni.cpp b/java/src/main/native/src/ScalarJni.cpp
index 8939c77f234..e0fad0a60c4 100644
--- a/java/src/main/native/src/ScalarJni.cpp
+++ b/java/src/main/native/src/ScalarJni.cpp
@@ -161,7 +161,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeBool8Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::BOOL8));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int8_t>;
       int8_t val = value ? 1 : 0;
@@ -178,7 +178,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt8Scalar(JNIEnv *env, j
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT8));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int8_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int8_t>(value));
@@ -194,7 +194,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint8Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT8));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint8_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<uint8_t>(value));
@@ -211,7 +211,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt16Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT16));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int16_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int16_t>(value));
@@ -228,7 +228,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint16Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT16));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint16_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<uint16_t>(value));
@@ -245,7 +245,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDurationDaysScalar(JNIEnv
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_duration_scalar(cudf::data_type(cudf::type_id::DURATION_DAYS));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int32_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int32_t>(value));
@@ -261,7 +261,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt32Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int32_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int32_t>(value));
@@ -277,7 +277,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint32Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT32));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint32_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<uint32_t>(value));
@@ -293,7 +293,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt64Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT64));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int64_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int64_t>(value));
@@ -310,7 +310,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint64Scalar(JNIEnv *env,
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT64));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint64_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<uint64_t>(value));
@@ -327,7 +327,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeFloat32Scalar(JNIEnv *env
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::FLOAT32));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<float>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<float>(value));
@@ -344,7 +344,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeFloat64Scalar(JNIEnv *env
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::FLOAT64));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<double>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<double>(value));
@@ -378,7 +378,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeTimestampDaysScalar(JNIEn
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
         cudf::make_timestamp_scalar(cudf::data_type(cudf::type_id::TIMESTAMP_DAYS));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int32_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int32_t>(value));
@@ -396,7 +396,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDurationTimeScalar(JNIEnv
     cudf::jni::auto_set_device(env);
     auto dtype_id = static_cast<cudf::type_id>(jdtype_id);
     std::unique_ptr<cudf::scalar> s = cudf::make_duration_scalar(cudf::data_type(dtype_id));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int64_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int64_t>(value));
@@ -414,7 +414,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeTimestampTimeScalar(JNIEn
     cudf::jni::auto_set_device(env);
     auto dtype_id = static_cast<cudf::type_id>(jdtype_id);
     std::unique_ptr<cudf::scalar> s = cudf::make_timestamp_scalar(cudf::data_type(dtype_id));
-    s->set_valid(is_valid);
+    s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int64_t>;
       static_cast<ScalarType *>(s.get())->set_value(static_cast<int64_t>(value));
@@ -425,31 +425,30 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeTimestampTimeScalar(JNIEn
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal32Scalar(JNIEnv *env, jclass,
-                                                                       jint value,
-                                                                       jint scale,
+                                                                       jint value, jint scale,
                                                                        jboolean is_valid) {
   try {
     cudf::jni::auto_set_device(env);
     auto const value_ = static_cast<int32_t>(value);
     auto const scale_ = numeric::scale_type{static_cast<int32_t>(scale)};
-    std::unique_ptr<cudf::scalar> s = cudf::make_fixed_point_scalar<numeric::decimal32>(value_, scale_);
-    s->set_valid(is_valid);
+    std::unique_ptr<cudf::scalar> s =
+        cudf::make_fixed_point_scalar<numeric::decimal32>(value_, scale_);
+    s->set_valid_async(is_valid);
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal64Scalar(JNIEnv *env, jclass,
-                                                                       jlong value,
-                                                                       jint scale,
+                                                                       jlong value, jint scale,
                                                                        jboolean is_valid) {
   try {
     cudf::jni::auto_set_device(env);
     auto const value_ = static_cast<int64_t>(value);
     auto const scale_ = numeric::scale_type{static_cast<int32_t>(scale)};
-    std::unique_ptr<cudf::scalar> s = cudf::make_fixed_point_scalar<numeric::decimal64>(value_, scale_);
-    s->set_valid(is_valid);
+    std::unique_ptr<cudf::scalar> s =
+        cudf::make_fixed_point_scalar<numeric::decimal64>(value_, scale_);
+    s->set_valid_async(is_valid);
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
@@ -467,8 +466,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_binaryOpSV(JNIEnv *env, jclas
     cudf::data_type n_data_type = cudf::jni::make_data_type(out_dtype, scale);
 
     cudf::binary_operator op = static_cast<cudf::binary_operator>(int_op);
-    std::unique_ptr<cudf::column> result = cudf::binary_operation(
-        *lhs, *rhs, op, n_data_type);
+    std::unique_ptr<cudf::column> result = cudf::binary_operation(*lhs, *rhs, op, n_data_type);
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
@@ -486,8 +484,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeListScalar(JNIEnv *env, j
     // is false, always passes the input view to the scalar, to avoid copying the column
     // twice.
     // Let the Java layer make sure the view is empty when `is_valid` is false.
-    cudf::scalar* s = new cudf::list_scalar(*col_view);
-    s->set_valid(is_valid);
+    cudf::scalar *s = new cudf::list_scalar(*col_view);
+    s->set_valid_async(is_valid);
     return reinterpret_cast<jlong>(s);
   }
   CATCH_STD(env, 0);
diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu
index b20daf84fc1..ad791747713 100644
--- a/java/src/main/native/src/map_lookup.cu
+++ b/java/src/main/native/src/map_lookup.cu
@@ -163,14 +163,14 @@ std::unique_ptr<column> map_contains(column_view const &map_column, string_scala
   children.push_back(lcv.offsets());
   children.push_back(scv.child(0));
 
-  column_view list_of_keys(map_column.type(), map_column.size(),
-    nullptr, map_column.null_mask(), map_column.null_count(), 0, children);
-  auto contains_column  = lists::contains(list_of_keys, lookup_key);
+  column_view list_of_keys(map_column.type(), map_column.size(), nullptr, map_column.null_mask(),
+                           map_column.null_count(), 0, children);
+  auto contains_column = lists::contains(list_of_keys, lookup_key);
   // null will be skipped in all-aggregation when checking if all rows contain the key,
   // so replace all nulls with 0.
   std::unique_ptr<cudf::scalar> replacement =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::BOOL8));
-  replacement->set_valid(true);
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::BOOL8));
+  replacement->set_valid_async(true);
   using ScalarType = cudf::scalar_type_t<int8_t>;
   static_cast<ScalarType *>(replacement.get())->set_value(0);
   auto result = cudf::replace_nulls(contains_column->view(), *replacement);
@@ -197,9 +197,9 @@ std::unique_ptr<column> map_lookup(column_view const &map_column, string_scalar
   auto values_column = structs_column.child(1);
   auto table_for_gather = table_view{std::vector<cudf::column_view>{values_column}};
 
-  auto gathered_table = cudf::detail::gather(
-      table_for_gather, gather_map->view(), out_of_bounds_policy::NULLIFY,
-      detail::negative_index_policy::NOT_ALLOWED, stream, mr);
+  auto gathered_table =
+      cudf::detail::gather(table_for_gather, gather_map->view(), out_of_bounds_policy::NULLIFY,
+                           detail::negative_index_policy::NOT_ALLOWED, stream, mr);
 
   return std::make_unique<cudf::column>(std::move(gathered_table->get_column(0)));
 }
diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu
index 402a592ef99..68f1ae93dec 100644
--- a/java/src/main/native/src/row_conversion.cu
+++ b/java/src/main/native/src/row_conversion.cu
@@ -494,11 +494,11 @@ std::vector<std::unique_ptr<cudf::column>> convert_to_rows(cudf::table_view cons
 
     using ScalarType = cudf::scalar_type_t<cudf::size_type>;
     auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value());
-    zero->set_valid(true, stream);
+    zero->set_valid_async(true, stream);
     static_cast<ScalarType *>(zero.get())->set_value(0, stream);
 
     auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value());
-    step->set_valid(true, stream);
+    step->set_valid_async(true, stream);
     static_cast<ScalarType *>(step.get())
         ->set_value(static_cast<cudf::size_type>(size_per_row), stream);
 
diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java
index 00de3a696ad..e317392196e 100644
--- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java
@@ -18,23 +18,20 @@
 
 package ai.rapids.cudf;
 
+import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static org.junit.jupiter.api.Assertions.*;
+
 import ai.rapids.cudf.HostColumnVector.BasicType;
 import ai.rapids.cudf.HostColumnVector.DataType;
 import ai.rapids.cudf.HostColumnVector.ListType;
 import ai.rapids.cudf.HostColumnVector.StructData;
 import ai.rapids.cudf.HostColumnVector.StructType;
-
-import org.junit.jupiter.api.Test;
-
 import java.math.BigDecimal;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
-
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
-import static org.junit.jupiter.api.Assertions.*;
+import org.junit.jupiter.api.Test;
 
 public class ScalarTest extends CudfTestBase {
-
   @Test
   public void testDoubleClose() {
     Scalar s = Scalar.fromNull(DType.INT32);
@@ -86,8 +83,7 @@ public void testNull() {
       }
 
       // test list scalar with elementType(`type`)
-      try (Scalar s = Scalar.listFromNull(hDataType);
-           ColumnView listCv = s.getListAsColumnView()) {
+      try (Scalar s = Scalar.listFromNull(hDataType); ColumnView listCv = s.getListAsColumnView()) {
         assertFalse(s.isValid(), "null validity for " + type);
         assertEquals(DType.LIST, s.getType());
         assertEquals(type, listCv.getType());
@@ -186,16 +182,21 @@ public void testDouble() {
 
   @Test
   public void testDecimal() {
-    BigDecimal[] bigDecimals = new BigDecimal[]{
+    BigDecimal[] bigDecimals = new BigDecimal[] {
         BigDecimal.valueOf(1234, 0),
         BigDecimal.valueOf(12345678, 2),
         BigDecimal.valueOf(1234567890123L, 6),
     };
-    for (BigDecimal dec: bigDecimals) {
+    for (BigDecimal dec : bigDecimals) {
       try (Scalar s = Scalar.fromDecimal(dec)) {
-        assertEquals(DType.fromJavaBigDecimal(dec), s.getType());
+        DType dtype = DType.fromJavaBigDecimal(dec);
+        assertEquals(dtype, s.getType());
         assertTrue(s.isValid());
-        assertEquals(dec.unscaledValue().longValueExact(), s.getLong());
+        if (dtype.getTypeId() == DType.DTypeEnum.DECIMAL64) {
+          assertEquals(dec.unscaledValue().longValueExact(), s.getLong());
+        } else {
+          assertEquals(dec.unscaledValue().intValueExact(), s.getInt());
+        }
         assertEquals(dec, s.getBigDecimal());
       }
       try (Scalar s = Scalar.fromDecimal(-dec.scale(), dec.unscaledValue().intValueExact())) {
@@ -260,7 +261,7 @@ public void testString() {
       assertEquals(DType.STRING, s.getType());
       assertTrue(s.isValid());
       assertEquals("TEST", s.getJavaString());
-      assertArrayEquals(new byte[]{'T', 'E', 'S', 'T'}, s.getUTF8());
+      assertArrayEquals(new byte[] {'T', 'E', 'S', 'T'}, s.getUTF8());
     }
   }
 
@@ -270,13 +271,13 @@ public void testUTF8String() {
       assertEquals(DType.STRING, s.getType());
       assertTrue(s.isValid());
       assertEquals("TEST", s.getJavaString());
-      assertArrayEquals(new byte[]{'T', 'E', 'S', 'T'}, s.getUTF8());
+      assertArrayEquals(new byte[] {'T', 'E', 'S', 'T'}, s.getUTF8());
     }
     try (Scalar s = Scalar.fromUTF8String("".getBytes(StandardCharsets.UTF_8))) {
       assertEquals(DType.STRING, s.getType());
       assertTrue(s.isValid());
       assertEquals("", s.getJavaString());
-      assertArrayEquals(new byte[]{}, s.getUTF8());
+      assertArrayEquals(new byte[] {}, s.getUTF8());
     }
   }
 
@@ -293,11 +294,10 @@ public void testList() {
     }
 
     // list of list
-    HostColumnVector.DataType listDT = new HostColumnVector.ListType(true,
-            new HostColumnVector.BasicType(true, DType.INT32));
-    try (ColumnVector listList = ColumnVector.fromLists(listDT,
-            Arrays.asList(1, 2, 3),
-            Arrays.asList(4, 5, 6));
+    HostColumnVector.DataType listDT =
+        new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.INT32));
+    try (ColumnVector listList =
+             ColumnVector.fromLists(listDT, Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6));
          Scalar s = Scalar.listFromColumnView(listList)) {
       assertEquals(DType.LIST, s.getType());
       assertTrue(s.isValid());
@@ -377,22 +377,20 @@ public void testStruct() {
     }
 
     // test Struct Scalar with nested types
-    HostColumnVector.DataType listType = new HostColumnVector.ListType(true,
-        new HostColumnVector.BasicType(true, DType.INT32));
-    HostColumnVector.DataType structType = new HostColumnVector.StructType(true,
-        new HostColumnVector.BasicType(true, DType.INT32),
-        new HostColumnVector.BasicType(true, DType.INT64));
-    HostColumnVector.DataType nestedStructType = new HostColumnVector.StructType(true,
-        new HostColumnVector.BasicType(true, DType.STRING),
-        listType, structType);
+    HostColumnVector.DataType listType =
+        new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.INT32));
+    HostColumnVector.DataType structType =
+        new HostColumnVector.StructType(true, new HostColumnVector.BasicType(true, DType.INT32),
+            new HostColumnVector.BasicType(true, DType.INT64));
+    HostColumnVector.DataType nestedStructType = new HostColumnVector.StructType(
+        true, new HostColumnVector.BasicType(true, DType.STRING), listType, structType);
     try (ColumnVector strCol = ColumnVector.fromStrings("AAAAAA");
          ColumnVector listCol = ColumnVector.fromLists(listType, Arrays.asList(1, 2, 3, 4, 5));
-         ColumnVector structCol = ColumnVector.fromStructs(structType,
-             new HostColumnVector.StructData(1, -1L));
+         ColumnVector structCol =
+             ColumnVector.fromStructs(structType, new HostColumnVector.StructData(1, -1L));
          ColumnVector nestedStructCol = ColumnVector.fromStructs(nestedStructType,
-             new HostColumnVector.StructData(null,
-                 Arrays.asList(1, 2, null),
-                 new HostColumnVector.StructData(null, 10L)));
+             new HostColumnVector.StructData(
+                 null, Arrays.asList(1, 2, null), new HostColumnVector.StructData(null, 10L)));
          Scalar s = Scalar.structFromColumnViews(strCol, listCol, structCol, nestedStructCol)) {
       assertEquals(DType.STRUCT, s.getType());
       assertTrue(s.isValid());
diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
index de5cb05447c..feb747a5ccd 100644
--- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
+++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
@@ -17,7 +17,7 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
         scalar() except +
         scalar(scalar other) except +
         data_type type() except +
-        void set_valid(bool is_valid) except +
+        void set_valid_async(bool is_valid) except +
         bool is_valid() except +
 
     cdef cppclass numeric_scalar[T](scalar):

From 90e29d9f64207a61966aba216af55c9a11d4c5fa Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Tue, 8 Jun 2021 05:49:12 -0600
Subject: [PATCH 14/15] Add move constructors for `string_scalar` and
 `struct_scalar` (#8428)

This PR adds some move constructors for `string_scalar` and `struct_scalar` that accept r-value references to existing buffers. By doing so, the input buffer is moved into the internal buffer of the newly constructed scalars instead of copying.

This also cleans up and rewrites the Doxygen comments in `scalar.hpp`.

Closes #8427.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - MithunR (https://github.com/mythrocks)
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/8428
---
 cpp/include/cudf/scalar/scalar.hpp | 339 ++++++++++++++++-------------
 cpp/src/scalar/scalar.cpp          |  28 ++-
 2 files changed, 209 insertions(+), 158 deletions(-)

diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index e4d7baf86d9..2e57e56255d 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -36,7 +36,7 @@ namespace cudf {
  */
 
 /**
- * @brief An owning class to represent a singular value
+ * @brief An owning class to represent a singular value.
  *
  * A scalar is a singular value of any of the supported datatypes in cudf.
  * Classes derived from this class are used to represent a scalar. Objects of
@@ -54,45 +54,46 @@ class scalar {
   /**
    * @brief Construct a new scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   scalar(scalar const& other,
          rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
          rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Returns the scalar's logical value type
+   * @brief Returns the scalar's logical value type.
    */
   data_type type() const noexcept;
 
   /**
-   * @brief Updates the validity of the value
+   * @brief Updates the validity of the value.
    *
-   * @param is_valid true: set the value to valid. false: set it to null
+   * @param is_valid true: set the value to valid. false: set it to null.
    * @param stream CUDA stream used for device memory operations.
    */
   void set_valid_async(bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
   /**
-   * @brief Indicates whether the scalar contains a valid value
+   * @brief Indicates whether the scalar contains a valid value.
    *
-   * @note Using the value when `is_valid() == false` is undefined behaviour
+   * @note Using the value when `is_valid() == false` is undefined behaviour. In addition, this
+   * function does a stream synchronization.
    *
    * @param stream CUDA stream used for device memory operations.
-   * @return true Value is valid
-   * @return false Value is invalid/null
+   * @return true Value is valid.
+   * @return false Value is invalid/null.
    */
   bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
 
   /**
-   * @brief Returns a raw pointer to the validity bool in device memory
+   * @brief Returns a raw pointer to the validity bool in device memory.
    */
   bool* validity_data();
 
   /**
-   * @brief Returns a const raw pointer to the validity bool in device memory
+   * @brief Returns a const raw pointer to the validity bool in device memory.
    */
   bool const* validity_data() const;
 
@@ -103,15 +104,15 @@ class scalar {
   scalar() = delete;
 
   /**
-   * @brief Construct a new scalar object
+   * @brief Construct a new scalar object.
    *
    * @note Do not use this constructor directly. Instead, use a factory method
    * like make_numeric_scalar or make_string_scalar
    *
-   * @param[in] type Data type of the scalar
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param type Data type of the scalar.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   scalar(data_type type,
          bool is_valid                       = false,
@@ -136,41 +137,41 @@ class fixed_width_scalar : public scalar {
   /**
    * @brief Construct a new fixed-width scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_width_scalar(fixed_width_scalar const& other,
                      rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Set the value of the scalar
+   * @brief Set the value of the scalar.
    *
-   * @param value New value of scalar
+   * @param value New value of scalar.
    * @param stream CUDA stream used for device memory operations.
    */
   void set_value(T value, rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
   /**
-   * @brief Implicit conversion operator to get the value of the scalar on the host
+   * @brief Explicit conversion operator to get the value of the scalar on the host.
    */
   explicit operator value_type() const;
 
   /**
-   * @brief Get the value of the scalar
+   * @brief Get the value of the scalar.
    *
    * @param stream CUDA stream used for device memory operations.
    */
   T value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
 
   /**
-   * @brief Returns a raw pointer to the value in device memory
+   * @brief Returns a raw pointer to the value in device memory.
    */
   T* data();
 
   /**
-   * @brief Returns a const raw pointer to the value in device memory
+   * @brief Returns a const raw pointer to the value in device memory.
    */
   T const* data() const;
 
@@ -180,12 +181,12 @@ class fixed_width_scalar : public scalar {
   fixed_width_scalar() = delete;
 
   /**
-   * @brief Construct a new fixed width scalar object
+   * @brief Construct a new fixed width scalar object.
    *
-   * @param[in] value The initial value of the scalar
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param value The initial value of the scalar.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_width_scalar(T value,
                      bool is_valid                       = true,
@@ -195,10 +196,10 @@ class fixed_width_scalar : public scalar {
   /**
    * @brief Construct a new fixed width scalar object from existing device memory.
    *
-   * @param[in] data The scalar's data in device memory
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param data The scalar's data in device memory.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_width_scalar(rmm::device_scalar<T>&& data,
                      bool is_valid                       = true,
@@ -209,9 +210,9 @@ class fixed_width_scalar : public scalar {
 }  // namespace detail
 
 /**
- * @brief An owning class to represent a numerical value in device memory
+ * @brief An owning class to represent a numerical value in device memory.
  *
- * @tparam T the data type of the numerical value
+ * @tparam T the data type of the numerical value.
  */
 template <typename T>
 class numeric_scalar : public detail::fixed_width_scalar<T> {
@@ -228,21 +229,21 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
   /**
    * @brief Construct a new numeric scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   numeric_scalar(numeric_scalar const& other,
                  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new numeric scalar object
+   * @brief Construct a new numeric scalar object.
    *
-   * @param[in] value The initial value of the scalar
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param value The initial value of the scalar.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   numeric_scalar(T value,
                  bool is_valid                       = true,
@@ -252,10 +253,10 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
   /**
    * @brief Construct a new numeric scalar object from existing device memory.
    *
-   * @param[in] data The scalar's data in device memory
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param data The scalar's data in device memory.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   numeric_scalar(rmm::device_scalar<T>&& data,
                  bool is_valid                       = true,
@@ -264,9 +265,9 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
 };
 
 /**
- * @brief An owning class to represent a fixed_point number in device memory
+ * @brief An owning class to represent a fixed_point number in device memory.
  *
- * @tparam T the data type of the fixed_point number
+ * @tparam T the data type of the fixed_point number.
  */
 template <typename T>
 class fixed_point_scalar : public scalar {
@@ -286,22 +287,22 @@ class fixed_point_scalar : public scalar {
   /**
    * @brief Construct a new fixed_point scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(fixed_point_scalar const& other,
                      rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new fixed_point scalar object from already shifted value and scale
+   * @brief Construct a new fixed_point scalar object from already shifted value and scale.
    *
-   * @param[in] value The initial shifted value of the fixed_point scalar
-   * @param[in] scale The scale of the fixed_point scalar
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param value The initial shifted value of the fixed_point scalar.
+   * @param scale The scale of the fixed_point scalar.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(rep_type value,
                      numeric::scale_type scale,
@@ -310,12 +311,12 @@ class fixed_point_scalar : public scalar {
                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new fixed_point scalar object from a value and default 0-scale
+   * @brief Construct a new fixed_point scalar object from a value and default 0-scale.
    *
-   * @param[in] value The initial value of the fixed_point scalar
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param value The initial value of the fixed_point scalar.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(rep_type value,
                      bool is_valid                       = true,
@@ -323,12 +324,12 @@ class fixed_point_scalar : public scalar {
                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new fixed_point scalar object from a fixed_point number
+   * @brief Construct a new fixed_point scalar object from a fixed_point number.
    *
-   * @param[in] value The fixed_point number from which the fixed_point scalar will be initialized
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param value The fixed_point number from which the fixed_point scalar will be initialized.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(T value,
                      bool is_valid                       = true,
@@ -338,11 +339,11 @@ class fixed_point_scalar : public scalar {
   /**
    * @brief Construct a new fixed_point scalar object from existing device memory.
    *
-   * @param[in] data      The scalar's data in device memory
-   * @param[in] scale     The scale of the fixed_point scalar
-   * @param[in] is_valid  Whether the value held by the scalar is valid
-   * @param[in] stream    CUDA stream used for device memory operations.
-   * @param[in] mr        Device memory resource to use for device memory allocation
+   * @param data The scalar's data in device memory.
+   * @param scale The scale of the fixed_point scalar.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(rmm::device_scalar<rep_type>&& data,
                      numeric::scale_type scale,
@@ -351,26 +352,26 @@ class fixed_point_scalar : public scalar {
                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Get the value of the scalar
+   * @brief Get the value of the scalar.
    *
    * @param stream CUDA stream used for device memory operations.
    */
   rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
 
   /**
-   * @brief Get the decimal32 or decimal64
+   * @brief Get the decimal32 or decimal64.
    *
    * @param stream CUDA stream used for device memory operations.
    */
   T fixed_point_value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
 
   /**
-   * @brief Returns a raw pointer to the value in device memory
+   * @brief Returns a raw pointer to the value in device memory.
    */
   rep_type* data();
 
   /**
-   * @brief Returns a const raw pointer to the value in device memory
+   * @brief Returns a const raw pointer to the value in device memory.
    */
   rep_type const* data() const;
 
@@ -379,7 +380,7 @@ class fixed_point_scalar : public scalar {
 };
 
 /**
- * @brief An owning class to represent a string in device memory
+ * @brief An owning class to represent a string in device memory.
  */
 class string_scalar : public scalar {
  public:
@@ -396,21 +397,21 @@ class string_scalar : public scalar {
   /**
    * @brief Construct a new string scalar object by deep copying another string_scalar.
    *
-   * @param[in] other The other string_scalar to copy
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The other string_scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(string_scalar const& other,
                 rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new string scalar object
+   * @brief Construct a new string scalar object.
    *
-   * @param[in] value The value of the string
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param string The value of the string.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(std::string const& string,
                 bool is_valid                       = true,
@@ -418,13 +419,14 @@ class string_scalar : public scalar {
                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new string scalar object from string_view
+   * @brief Construct a new string scalar object from string_view.
+   *
    * Note that this function copies the data pointed by string_view.
    *
-   * @param[in] source string_view pointing string value to copy
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param source The string_view pointing to the string value to copy.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(value_type const& source,
                 bool is_valid                       = true,
@@ -432,13 +434,14 @@ class string_scalar : public scalar {
                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new string scalar object from string_view in device memory
+   * @brief Construct a new string scalar object from string_view in device memory.
+   *
    * Note that this function copies the data pointed by string_view.
    *
-   * @param[in] data device_scalar string_view pointing string value to copy
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param data The device_scalar of string_view pointing to the string value to copy.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(rmm::device_scalar<value_type>& data,
                 bool is_valid                       = true,
@@ -446,31 +449,47 @@ class string_scalar : public scalar {
                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Implicit conversion operator to get the value of the scalar in a host std::string
+   * @brief Construct a new string scalar object by moving an existing string data buffer.
+   *
+   * Note that this constructor moves the existing buffer into the internal data buffer;
+   * no copy is performed.
+   *
+   * @param data The existing buffer to take over.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
+   */
+  string_scalar(rmm::device_buffer&& data,
+                bool is_valid                       = true,
+                rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Explicit conversion operator to get the value of the scalar in a host std::string.
    */
   explicit operator std::string() const;
 
   /**
-   * @brief Get the value of the scalar in a host std::string
+   * @brief Get the value of the scalar in a host std::string.
    *
    * @param stream CUDA stream used for device memory operations.
    */
   std::string to_string(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
 
   /**
-   * @brief Get the value of the scalar as a string_view
+   * @brief Get the value of the scalar as a string_view.
    *
    * @param stream CUDA stream used for device memory operations.
    */
   value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
 
   /**
-   * @brief Returns the size of the string in bytes
+   * @brief Returns the size of the string in bytes.
    */
   size_type size() const;
 
   /**
-   * @brief Returns a raw pointer to the string in device memory
+   * @brief Returns a raw pointer to the string in device memory.
    */
   const char* data() const;
 
@@ -479,10 +498,10 @@ class string_scalar : public scalar {
 };
 
 /**
- * @brief An owning class to represent a timestamp/duration value in device memory
+ * @brief An owning class to represent a timestamp/duration value in device memory.
  *
- * @tparam T the data type of the timestamp/duration value
- * @see cudf/wrappers/timestamps.hpp, cudf/wrappers/durations.hpp for a list of allowed types
+ * @tparam T the data type of the timestamp/duration value.
+ * @see cudf/wrappers/timestamps.hpp, cudf/wrappers/durations.hpp for a list of allowed types.
  */
 template <typename T>
 class chrono_scalar : public detail::fixed_width_scalar<T> {
@@ -499,21 +518,21 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
   /**
    * @brief Construct a new chrono scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   chrono_scalar(chrono_scalar const& other,
                 rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new chrono scalar object
+   * @brief Construct a new chrono scalar object.
    *
-   * @param[in] value The initial value of the scalar
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param value The initial value of the scalar.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   chrono_scalar(T value,
                 bool is_valid                       = true,
@@ -523,10 +542,10 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
   /**
    * @brief Construct a new chrono scalar object from existing device memory.
    *
-   * @param[in] data The scalar's data in device memory
-   * @param[in] is_valid Whether the value held by the scalar is valid
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param data The scalar's data in device memory.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   chrono_scalar(rmm::device_scalar<T>&& data,
                 bool is_valid                       = true,
@@ -547,9 +566,9 @@ class timestamp_scalar : public chrono_scalar<T> {
   /**
    * @brief Construct a new timestamp scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   timestamp_scalar(timestamp_scalar const& other,
                    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
@@ -559,11 +578,11 @@ class timestamp_scalar : public chrono_scalar<T> {
    * @brief Construct a new timestamp scalar object from a duration that is
    * convertible to T::duration
    *
-   * @param value Duration representing number of ticks since the UNIX epoch or
-   * another duration that is convertible to timestamps duration
-   * @param is_valid Whether the value held by the scalar is valid
+   * @param value Duration representing number of ticks since the UNIX epoch or another duration
+   *        that is convertible to timestamps duration.
+   * @param is_valid Whether the value held by the scalar is valid.
    * @param stream CUDA stream used for device memory operations.
-   * @param mr Device memory resource to use for device memory allocation
+   * @param mr Device memory resource to use for device memory allocation.
    */
   template <typename Duration2>
   timestamp_scalar(Duration2 const& value,
@@ -590,21 +609,21 @@ class duration_scalar : public chrono_scalar<T> {
   /**
    * @brief Construct a new duration scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   duration_scalar(duration_scalar const& other,
                   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new duration scalar object from tick counts
+   * @brief Construct a new duration scalar object from tick counts.
    *
-   * @param value Integer representing number of ticks since the UNIX epoch
-   * @param is_valid Whether the value held by the scalar is valid
+   * @param value Integer representing number of ticks since the UNIX epoch.
+   * @param is_valid Whether the value held by the scalar is valid.
    * @param stream CUDA stream used for device memory operations.
-   * @param mr Device memory resource to use for device memory allocation
+   * @param mr Device memory resource to use for device memory allocation.
    */
   duration_scalar(rep_type value,
                   bool is_valid,
@@ -618,7 +637,7 @@ class duration_scalar : public chrono_scalar<T> {
 };
 
 /**
- * @brief An owning class to represent a list value in device memory
+ * @brief An owning class to represent a list value in device memory.
  */
 class list_scalar : public scalar {
  public:
@@ -632,23 +651,23 @@ class list_scalar : public scalar {
   /**
    * @brief Construct a new list scalar object by deep copying another.
    *
-   * @param[in] other The scalar to copy.
-   * @param[in] stream CUDA stream used for device memory operations.
-   * @param[in] mr Device memory resource to use for device memory allocation
+   * @param other The scalar to copy.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
    */
   list_scalar(list_scalar const& other,
               rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
               rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new list scalar object from column_view
+   * @brief Construct a new list scalar object from column_view.
    *
    * The input column_view is copied.
    *
    * @param data The column data to copy.
-   * @param is_valid Whether the value held by the scalar is valid
+   * @param is_valid Whether the value held by the scalar is valid.
    * @param stream CUDA stream used for device memory operations.
-   * @param mr Device memory resource to use for device memory allocation
+   * @param mr Device memory resource to use for device memory allocation.
    */
   list_scalar(cudf::column_view const& data,
               bool is_valid                       = true,
@@ -658,10 +677,10 @@ class list_scalar : public scalar {
   /**
    * @brief Construct a new list scalar object from existing column.
    *
-   * @param data The column to take ownership of
-   * @param is_valid Whether the value held by the scalar is valid
+   * @param data The column to take ownership of.
+   * @param is_valid Whether the value held by the scalar is valid.
    * @param stream CUDA stream used for device memory operations.
-   * @param mr Device memory resource to use for device memory allocation
+   * @param mr Device memory resource to use for device memory allocation.
    */
   list_scalar(cudf::column&& data,
               bool is_valid                       = true,
@@ -669,7 +688,7 @@ class list_scalar : public scalar {
               rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Returns a non-owning, immutable view to underlying device data
+   * @brief Returns a non-owning, immutable view to underlying device data.
    */
   column_view view() const;
 
@@ -678,7 +697,7 @@ class list_scalar : public scalar {
 };
 
 /**
- * @brief An owning class to represent a struct value in device memory
+ * @brief An owning class to represent a struct value in device memory.
  */
 class struct_scalar : public scalar {
  public:
@@ -690,14 +709,14 @@ class struct_scalar : public scalar {
   struct_scalar& operator=(struct_scalar&& other) = delete;
 
   /**
-   * @brief Construct a new struct scalar object from table_view
+   * @brief Construct a new struct scalar object from table_view.
    *
    * The input table_view is deep-copied.
    *
    * @param data The table data to copy.
-   * @param is_valid Whether the value held by the scalar is valid
+   * @param is_valid Whether the value held by the scalar is valid.
    * @param stream CUDA stream used for device memory operations.
-   * @param mr Device memory resource to use for device memory allocation
+   * @param mr Device memory resource to use for device memory allocation.
    */
   struct_scalar(table_view const& data,
                 bool is_valid                       = true,
@@ -705,14 +724,14 @@ class struct_scalar : public scalar {
                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Construct a new struct scalar object from a host_span of column_views
+   * @brief Construct a new struct scalar object from a host_span of column_views.
    *
    * The input column_views are deep-copied.
    *
    * @param data The column_views to copy.
-   * @param is_valid Whether the value held by the scalar is valid
+   * @param is_valid Whether the value held by the scalar is valid.
    * @param stream CUDA stream used for device memory operations.
-   * @param mr Device memory resource to use for device memory allocation
+   * @param mr Device memory resource to use for device memory allocation.
    */
   struct_scalar(host_span<column_view const> data,
                 bool is_valid                       = true,
@@ -720,7 +739,23 @@ class struct_scalar : public scalar {
                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @brief Returns a non-owning, immutable view to underlying device data
+   * @brief Construct a new struct scalar object from an existing table in device memory.
+   *
+   * Note that this constructor moves the existing table data into the internal table data;
+   * no copies are performed.
+   *
+   * @param data The existing table data to take over.
+   * @param is_valid Whether the value held by the scalar is valid.
+   * @param stream CUDA stream used for device memory operations.
+   * @param mr Device memory resource to use for device memory allocation.
+   */
+  struct_scalar(table&& data,
+                bool is_valid                       = true,
+                rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Returns a non-owning, immutable view to underlying device data.
    */
   table_view view() const;
 
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 653164161e8..546eb050a60 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -61,7 +61,8 @@ string_scalar::string_scalar(std::string const& string,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
-  : scalar(data_type(type_id::STRING), is_valid), _data(string.data(), string.size(), stream, mr)
+  : scalar(data_type(type_id::STRING), is_valid, stream, mr),
+    _data(string.data(), string.size(), stream, mr)
 {
 }
 
@@ -84,11 +85,19 @@ string_scalar::string_scalar(value_type const& source,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
-  : scalar(data_type(type_id::STRING), is_valid),
+  : scalar(data_type(type_id::STRING), is_valid, stream, mr),
     _data(source.data(), source.size_bytes(), stream, mr)
 {
 }
 
+string_scalar::string_scalar(rmm::device_buffer&& data,
+                             bool is_valid,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
+  : scalar(data_type(type_id::STRING), is_valid, stream, mr), _data(std::move(data))
+{
+}
+
 string_scalar::value_type string_scalar::value(rmm::cuda_stream_view stream) const
 {
   return value_type{data(), size()};
@@ -146,8 +155,7 @@ fixed_point_scalar<T>::fixed_point_scalar(rmm::device_scalar<rep_type>&& data,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
-  : scalar{data_type{type_to_id<T>(), scale}, is_valid, stream, mr},
-    _data{std::forward<rmm::device_scalar<rep_type>>(data)}
+  : scalar{data_type{type_to_id<T>(), scale}, is_valid, stream, mr}, _data{std::move(data)}
 {
 }
 
@@ -212,8 +220,7 @@ fixed_width_scalar<T>::fixed_width_scalar(rmm::device_scalar<T>&& data,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
-  : scalar(data_type(type_to_id<T>()), is_valid, stream, mr),
-    _data{std::forward<rmm::device_scalar<T>>(data)}
+  : scalar(data_type(type_to_id<T>()), is_valid, stream, mr), _data{std::move(data)}
 {
 }
 
@@ -526,6 +533,15 @@ struct_scalar::struct_scalar(host_span<column_view const> data,
   init(is_valid, stream, mr);
 }
 
+struct_scalar::struct_scalar(table&& data,
+                             bool is_valid,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
+  : scalar(data_type(type_id::STRUCT), is_valid, stream, mr), _data(std::move(data))
+{
+  init(is_valid, stream, mr);
+}
+
 table_view struct_scalar::view() const { return _data.view(); }
 
 void struct_scalar::init(bool is_valid,

From a9f15b8cab5c31813e5cf60720c55ac0f9b64601 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 9 Jun 2021 08:21:26 -0400
Subject: [PATCH 15/15] Install only the same Thrust files that Thrust itself
 installs (#8420)

Fixes #8397

Recreate Thrust's install rules so that we don't install extra directories such as `.git`. We can't reuse Thrust's own install rules, because that would break our layout: the CMake files would need to go into something like `lib/cmake/cudf/thirdparty?/lib/cmake/thrust`. Such an unusual path is required because cudf packages a modified version of Thrust, and we don't want that version to be installed into a default system path.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/8420
---
 cpp/CMakeLists.txt                        | 10 ----------
 cpp/cmake/thirdparty/CUDF_GetThrust.cmake | 18 +++++++++++++++++-
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4b2e81edb9d..abfaeba86c3 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -598,16 +598,6 @@ install(DIRECTORY
             ${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf)
 
-install(DIRECTORY ${Thrust_SOURCE_DIR}/
-  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust
-  PATTERN "*.py" EXCLUDE
-  PATTERN "benchmark" EXCLUDE
-  PATTERN "build" EXCLUDE
-  PATTERN "doc" EXCLUDE
-  PATTERN "examples" EXCLUDE
-  PATTERN "test" EXCLUDE
-  PATTERN "testing" EXCLUDE)
-
 include(CMakePackageConfigHelpers)
 
 configure_package_config_file(cmake/cudf-config.cmake.in "${CUDF_BINARY_DIR}/cmake/cudf-config.cmake"
diff --git a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake
index 343ade8664d..2792786f553 100644
--- a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake
@@ -35,7 +35,23 @@ function(find_and_configure_thrust VERSION)
 
     thrust_create_target(cudf::Thrust FROM_OPTIONS)
     set(THRUST_LIBRARY "cudf::Thrust" PARENT_SCOPE)
-    set(Thrust_SOURCE_DIR "${Thrust_SOURCE_DIR}" PARENT_SCOPE)
+
+    include(GNUInstallDirs)
+    install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/"
+        FILES_MATCHING
+            PATTERN "*.h"
+            PATTERN "*.inl")
+    install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/"
+        FILES_MATCHING
+            PATTERN "*.cuh")
+
+    install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/thrust/")
+    install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/cub/")
+
 endfunction()
 
 set(CUDF_MIN_VERSION_Thrust 1.12.0)