From dbd2b08d6793b7f9c5b1c1901f0688c4e80e86bd Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 25 May 2022 07:35:06 -0400
Subject: [PATCH] Update `groupby::hash` to use new row operators for keys
 (#10770)

Related to #8039 and #10181

Contributes to #10186

This PR updates `groupby::hash` to use new row operators. It gets rid of the current "flattened nested column" logic and allows `groupby::hash` to handle `LIST` and `STRUCT` keys. The work also involves small cleanups like getting rid of unnecessary template parameters and removing unused arguments.

This is a breaking change: the updated `groupby::hash` treats inner nulls as equal when top-level nulls are excluded,
whereas the previous behavior treated inner nulls as **unequal**.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/10770
---
 cpp/benchmarks/CMakeLists.txt                 |   4 +-
 cpp/benchmarks/groupby/group_struct_keys.cpp  | 101 +++++++++
 ...roup_struct.cu => group_struct_values.cpp} |   0
 cpp/include/cudf/detail/groupby.hpp           |   5 +-
 cpp/src/groupby/groupby.cu                    |  14 +-
 cpp/src/groupby/hash/groupby.cu               | 126 +++++------
 cpp/src/groupby/hash/groupby_kernels.cuh      |   4 -
 cpp/tests/CMakeLists.txt                      |   2 +-
 cpp/tests/groupby/keys_tests.cpp              |  78 ++++++-
 cpp/tests/groupby/lists_tests.cpp             |  69 ------
 cpp/tests/groupby/lists_tests.cu              | 214 ++++++++++++++++++
 11 files changed, 459 insertions(+), 158 deletions(-)
 create mode 100644 cpp/benchmarks/groupby/group_struct_keys.cpp
 rename cpp/benchmarks/groupby/{group_struct.cu => group_struct_values.cpp} (100%)
 delete mode 100644 cpp/tests/groupby/lists_tests.cpp
 create mode 100644 cpp/tests/groupby/lists_tests.cu
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 04dcf51dd40..cb4ead20d00 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -198,13 +198,13 @@ ConfigureBench(
   groupby/group_sum.cu
   groupby/group_nth.cu
   groupby/group_shift.cu
-  groupby/group_struct.cu
+  groupby/group_struct_values.cpp
   groupby/group_no_requests.cu
   groupby/group_scan.cu
   groupby/group_rank_benchmark.cu
 )
 
-ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu)
+ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu groupby/group_struct_keys.cpp)
 
 # ##################################################################################################
 # * hashing benchmark -----------------------------------------------------------------------------
diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp
new file mode 100644
index 00000000000..8398125db21
--- /dev/null
+++ b/cpp/benchmarks/groupby/group_struct_keys.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/groupby.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <random>
+
+void bench_groupby_struct_keys(nvbench::state& state)  // MIN groupby over struct-nested keys
+{
+  cudf::rmm_pool_raii pool_raii;
+
+  using Type           = int;
+  using column_wrapper = cudf::test::fixed_width_column_wrapper<Type>;
+  std::default_random_engine generator;
+  std::uniform_int_distribution<int> distribution(0, 100);  // closed range [0, 100]
+
+  const cudf::size_type n_rows{static_cast<cudf::size_type>(state.get_int64("NumRows"))};
+  const cudf::size_type n_cols{1};
+  const cudf::size_type depth{static_cast<cudf::size_type>(state.get_int64("Depth"))};
+  const bool nulls{static_cast<bool>(state.get_int64("Nulls"))};
+
+  // Create leaf key columns with values in [0, 100]; with "Nulls", every 10th row is null
+  std::vector<column_wrapper> columns;
+  columns.reserve(n_cols);
+  std::generate_n(std::back_inserter(columns), n_cols, [&]() {
+    auto const elements = cudf::detail::make_counting_transform_iterator(
+      0, [&](auto row) { return distribution(generator); });
+    if (!nulls) return column_wrapper(elements, elements + n_rows);
+    auto valids =
+      cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 10 != 0; });
+    return column_wrapper(elements, elements + n_rows, valids);
+  });
+
+  std::vector<std::unique_ptr<cudf::column>> cols;
+  std::transform(columns.begin(), columns.end(), std::back_inserter(cols), [](column_wrapper& col) {
+    return col.release();
+  });
+
+  std::vector<std::unique_ptr<cudf::column>> child_cols = std::move(cols);
+  // Wrap the keys in `depth` struct layers; a validity entry is false only when the draw is 0
+  for (int i = 0; i < depth; i++) {
+    std::vector<bool> struct_validity;
+    std::uniform_int_distribution<int> bool_distribution(0, 100 * (i + 1));
+    std::generate_n(
+      std::back_inserter(struct_validity), n_rows, [&]() { return bool_distribution(generator); });
+    cudf::test::structs_column_wrapper struct_col(std::move(child_cols), struct_validity);
+    child_cols = std::vector<std::unique_ptr<cudf::column>>{};
+    child_cols.push_back(struct_col.release());
+  }
+
+  data_profile profile;
+  profile.set_null_frequency(std::nullopt);
+  profile.set_cardinality(0);
+  profile.set_distribution_params<int64_t>(
+    cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
+
+  auto const keys_table = cudf::table(std::move(child_cols));
+  auto const vals_table =
+    create_random_table({cudf::type_to_id<int64_t>()}, row_count{n_rows}, profile);
+
+  cudf::groupby::groupby gb_obj(keys_table.view());
+
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests[0].values = vals_table->get_column(0).view();
+  requests[0].aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
+
+  // Register the RMM default stream with nvbench for this benchmark state
+  auto stream = rmm::cuda_stream_default;
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
+}
+
+NVBENCH_BENCH(bench_groupby_struct_keys)
+  .set_name("groupby_struct_keys")
+  .add_int64_power_of_two_axis("NumRows", {10, 16, 20})  // 2^10, 2^16 and 2^20 rows
+  .add_int64_axis("Depth", {0, 1, 8})                    // struct layers wrapped around the keys
+  .add_int64_axis("Nulls", {0, 1});                      // cast to bool: leaf keys contain nulls
diff --git a/cpp/benchmarks/groupby/group_struct.cu b/cpp/benchmarks/groupby/group_struct_values.cpp
similarity index 100%
rename from cpp/benchmarks/groupby/group_struct.cu
rename to cpp/benchmarks/groupby/group_struct_values.cpp
diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp
index 36a76c7b6de..0037a01b496 100644
--- a/cpp/include/cudf/detail/groupby.hpp
+++ b/cpp/include/cudf/detail/groupby.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,13 +31,12 @@ namespace hash {
  * @brief Indicates if a set of aggregation requests can be satisfied with a
  * hash-based groupby implementation.
  *
- * @param keys The table of keys
  * @param requests The set of columns to aggregate and the aggregations to
  * perform
  * @return true A hash-based groupby can be used
  * @return false A hash-based groupby cannot be used
  */
-bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests);
+bool can_use_hash_groupby(host_span<aggregation_request const> requests);
 
 // Hash-based groupby
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index a002b0bb744..e25512f80c5 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -65,8 +65,6 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  using namespace cudf::structs::detail;
-
   // If sort groupby has been called once on this groupby object, then
   // always use sort groupby from now on. Because once keys are sorted,
   // all the aggs that can be done by hash groupby are efficiently done by
@@ -74,16 +72,8 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
   // Only use hash groupby if the keys aren't sorted and all requests can be
   // satisfied with a hash implementation
   if (_keys_are_sorted == sorted::NO and not _helper and
-      detail::hash::can_use_hash_groupby(_keys, requests)) {
-    // Optionally flatten nested key columns.
-    auto flattened             = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE);
-    auto flattened_keys        = flattened.flattened_columns();
-    auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); };
-    CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type),
-                 "Unsupported groupby key type does not support equality comparison");
-    auto [grouped_keys, results] =
-      detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr);
-    return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results));
+      detail::hash::can_use_hash_groupby(requests)) {
+    return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr);
   } else {
     return sort_aggregate(requests, stream, mr);
   }
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index e22b3a4f3a4..ab8d0089347 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -37,7 +37,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/table/row_operators.cuh>
+#include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
@@ -65,6 +65,15 @@ namespace detail {
 namespace hash {
 namespace {
 
+// Hash map type used by hash-based groupby. TODO: replace it with `cuco::static_map`
+// https://github.com/rapidsai/cudf/issues/10401
+using map_type = concurrent_unordered_map<
+  cudf::size_type,  // key type
+  cudf::size_type,  // mapped value type
+  cudf::experimental::row::hash::device_row_hasher<cudf::detail::default_hash,
+                                                   cudf::nullate::DYNAMIC>,
+  cudf::experimental::row::equality::device_row_comparator<cudf::nullate::DYNAMIC>>;
+
 /**
  * @brief List of aggregation operations that can be computed with a hash-based
  * implementation.
@@ -179,14 +188,13 @@ class groupby_simple_aggregations_collector final
   }
 };
 
-template <typename Map>
 class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer {
   column_view col;
   data_type result_type;
   cudf::detail::result_cache* sparse_results;
   cudf::detail::result_cache* dense_results;
   device_span<size_type const> gather_map;
-  Map const& map;
+  map_type const& map;
   bitmask_type const* __restrict__ row_bitmask;
   rmm::cuda_stream_view stream;
   rmm::mr::device_memory_resource* mr;
@@ -198,7 +206,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final
                               cudf::detail::result_cache* sparse_results,
                               cudf::detail::result_cache* dense_results,
                               device_span<size_type const> gather_map,
-                              Map const& map,
+                              map_type const& map,
                               bitmask_type const* row_bitmask,
                               rmm::cuda_stream_view stream,
                               rmm::mr::device_memory_resource* mr)
@@ -327,7 +335,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final
       rmm::exec_policy(stream),
       thrust::make_counting_iterator(0),
       col.size(),
-      ::cudf::detail::var_hash_functor<Map>{
+      ::cudf::detail::var_hash_functor<map_type>{
         map, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof});
     sparse_results->add_result(col, agg, std::move(var_result));
     dense_results->add_result(col, agg, to_dense_agg_result(agg));
@@ -385,14 +393,12 @@ flatten_single_pass_aggs(host_span<aggregation_request const> requests)
  *
  * @see groupby_null_templated()
  */
-template <typename Map>
 void sparse_to_dense_results(table_view const& keys,
                              host_span<aggregation_request const> requests,
                              cudf::detail::result_cache* sparse_results,
                              cudf::detail::result_cache* dense_results,
                              device_span<size_type const> gather_map,
-                             // size_type map_size,
-                             Map const& map,
+                             map_type const& map,
                              bool keys_have_nulls,
                              null_policy include_null_keys,
                              rmm::cuda_stream_view stream,
@@ -409,7 +415,7 @@ void sparse_to_dense_results(table_view const& keys,
 
     // Given an aggregation, this will get the result from sparse_results and
     // convert and return dense, compacted result
-    auto finalizer = hash_compound_agg_finalizer<Map>(
+    auto finalizer = hash_compound_agg_finalizer(
       col, sparse_results, dense_results, gather_map, map, row_bitmask_ptr, stream, mr);
     for (auto&& agg : agg_v) {
       agg->finalize(finalizer);
@@ -417,43 +423,6 @@ void sparse_to_dense_results(table_view const& keys,
   }
 }
 
-/**
- * @brief Construct hash map that uses row comparator and row hasher on
- * `d_keys` table and stores indices
- */
-auto create_hash_map(table_device_view const& d_keys,
-                     bool keys_have_nulls,
-                     null_policy include_null_keys,
-                     rmm::cuda_stream_view stream)
-{
-  size_type constexpr unused_key{std::numeric_limits<size_type>::max()};
-  size_type constexpr unused_value{std::numeric_limits<size_type>::max()};
-
-  using map_type =
-    concurrent_unordered_map<size_type,
-                             size_type,
-                             row_hasher<cudf::detail::default_hash, nullate::DYNAMIC>,
-                             row_equality_comparator<nullate::DYNAMIC>>;
-
-  using allocator_type = typename map_type::allocator_type;
-
-  auto const null_keys_are_equal =
-    include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL;
-
-  row_hasher<cudf::detail::default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls},
-                                                                  d_keys};
-  row_equality_comparator rows_equal{
-    nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal};
-
-  return map_type::create(compute_hash_table_size(d_keys.num_rows()),
-                          stream,
-                          unused_key,
-                          unused_value,
-                          hasher,
-                          rows_equal,
-                          allocator_type());
-}
-
 // make table that will hold sparse results
 auto create_sparse_results_table(table_view const& flattened_values,
                                  std::vector<aggregation::Kind> aggs,
@@ -491,11 +460,10 @@ auto create_sparse_results_table(table_view const& flattened_values,
  * @brief Computes all aggregations from `requests` that require a single pass
  * over the data and stores the results in `sparse_results`
  */
-template <typename Map>
 void compute_single_pass_aggs(table_view const& keys,
                               host_span<aggregation_request const> requests,
                               cudf::detail::result_cache* sparse_results,
-                              Map& map,
+                              map_type& map,
                               bool keys_have_nulls,
                               null_policy include_null_keys,
                               rmm::cuda_stream_view stream)
@@ -509,22 +477,22 @@ void compute_single_pass_aggs(table_view const& keys,
   auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
   auto d_values       = table_device_view::create(flattened_values, stream);
   auto const d_aggs   = cudf::detail::make_device_uvector_async(agg_kinds, stream);
-
-  bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
+  auto const skip_key_rows_with_nulls =
+    keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
 
   auto row_bitmask =
     skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{};
+
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator(0),
     keys.num_rows(),
-    hash::compute_single_pass_aggs_fn<Map>{map,
-                                           keys.num_rows(),
-                                           *d_values,
-                                           *d_sparse_table,
-                                           d_aggs.data(),
-                                           static_cast<bitmask_type*>(row_bitmask.data()),
-                                           skip_key_rows_with_nulls});
+    hash::compute_single_pass_aggs_fn<map_type>{map,
+                                                *d_values,
+                                                *d_sparse_table,
+                                                d_aggs.data(),
+                                                static_cast<bitmask_type*>(row_bitmask.data()),
+                                                skip_key_rows_with_nulls});
   // Add results back to sparse_results cache
   auto sparse_result_cols = sparse_table.release();
   for (size_t i = 0; i < aggs.size(); i++) {
@@ -538,8 +506,7 @@ void compute_single_pass_aggs(table_view const& keys,
  * @brief Computes and returns a device vector containing all populated keys in
  * `map`.
  */
-template <typename Map>
-rmm::device_uvector<size_type> extract_populated_keys(Map map,
+rmm::device_uvector<size_type> extract_populated_keys(map_type const& map,
                                                       size_type num_keys,
                                                       rmm::cuda_stream_view stream)
 {
@@ -589,13 +556,33 @@ rmm::device_uvector<size_type> extract_populated_keys(Map map,
 std::unique_ptr<table> groupby(table_view const& keys,
                                host_span<aggregation_request const> requests,
                                cudf::detail::result_cache* cache,
-                               bool keys_have_nulls,
-                               null_policy include_null_keys,
+                               bool const keys_have_nulls,
+                               null_policy const include_null_keys,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
-  auto d_keys_ptr = table_device_view::create(keys, stream);
-  auto map        = create_hash_map(*d_keys_ptr, keys_have_nulls, include_null_keys, stream);
+  auto const num_keys            = keys.num_rows();
+  auto const null_keys_are_equal = null_equality::EQUAL;
+  auto const has_null            = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};
+
+  auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream);
+  auto const comparator  = cudf::experimental::row::equality::self_comparator{preprocessed_keys};
+  auto const row_hash    = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)};
+  auto const d_key_equal = comparator.device_comparator(has_null, null_keys_are_equal);
+  auto const d_row_hash  = row_hash.device_hasher(has_null);
+
+  size_type constexpr unused_key{std::numeric_limits<size_type>::max()};
+  size_type constexpr unused_value{std::numeric_limits<size_type>::max()};
+
+  using allocator_type = typename map_type::allocator_type;
+
+  auto map = map_type::create(compute_hash_table_size(num_keys),
+                              stream,
+                              unused_key,
+                              unused_value,
+                              d_row_hash,
+                              d_key_equal,
+                              allocator_type());
 
   // Cache of sparse results where the location of aggregate value in each
   // column is indexed by the hash map
@@ -635,13 +622,12 @@ std::unique_ptr<table> groupby(table_view const& keys,
  * @brief Indicates if a set of aggregation requests can be satisfied with a
  * hash-based groupby implementation.
  *
- * @param keys The table of keys
  * @param requests The set of columns to aggregate and the aggregations to
  * perform
  * @return true A hash-based groupby should be used
  * @return false A hash-based groupby should not be used
  */
-bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests)
+bool can_use_hash_groupby(host_span<aggregation_request const> requests)
 {
   return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
     // Currently, structs are not supported in any of hash-based aggregations.
@@ -667,10 +653,18 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
+  auto const has_nested_column =
+    std::any_of(keys.begin(), keys.end(), [](cudf::column_view const& col) {
+      return cudf::is_nested(col.type());
+    });
+  if (has_nested_column and include_null_keys == cudf::null_policy::EXCLUDE) {
+    CUDF_FAIL("Null keys of nested type cannot be excluded.");
+  }
+
   cudf::detail::result_cache cache(requests.size());
 
   std::unique_ptr<table> unique_keys =
-    groupby(keys, requests, &cache, has_nulls(keys), include_null_keys, stream, mr);
+    groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr);
 
   return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr));
 }
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh
index 79286fb3839..eedb07200a5 100644
--- a/cpp/src/groupby/hash/groupby_kernels.cuh
+++ b/cpp/src/groupby/hash/groupby_kernels.cuh
@@ -65,7 +65,6 @@ namespace hash {
 template <typename Map>
 struct compute_single_pass_aggs_fn {
   Map map;
-  size_type num_keys;
   table_device_view input_values;
   mutable_table_device_view output_values;
   aggregation::Kind const* __restrict__ aggs;
@@ -76,7 +75,6 @@ struct compute_single_pass_aggs_fn {
    * @brief Construct a new compute_single_pass_aggs_fn functor object
    *
    * @param map Hash map object to insert key,value pairs into.
-   * @param num_keys The number of rows in input keys table
    * @param input_values The table whose rows will be aggregated in the values
    * of the hash map
    * @param output_values Table that stores the results of aggregating rows of
@@ -90,14 +88,12 @@ struct compute_single_pass_aggs_fn {
    * bitmask where bit `i` indicates the presence of a null value in row `i`.
    */
   compute_single_pass_aggs_fn(Map map,
-                              size_type num_keys,
                               table_device_view input_values,
                               mutable_table_device_view output_values,
                               aggregation::Kind const* aggs,
                               bitmask_type const* row_bitmask,
                               bool skip_rows_with_nulls)
     : map(map),
-      num_keys(num_keys),
       input_values(input_values),
       output_values(output_values),
       aggs(aggs),
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eadcd985de3..c85b10b4eb8 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -83,7 +83,7 @@ ConfigureTest(
   groupby/covariance_tests.cpp
   groupby/groups_tests.cpp
   groupby/keys_tests.cpp
-  groupby/lists_tests.cpp
+  groupby/lists_tests.cu
   groupby/m2_tests.cpp
   groupby/min_tests.cpp
   groupby/max_scan_tests.cpp
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 94c26f3fe8f..19e82c4ffd1 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -242,6 +242,82 @@ TYPED_TEST(groupby_keys_test, mismatch_num_rows)
                             "Size mismatch between request values and groupby keys.");
 }
 
+template <typename T>
+using FWCW = cudf::test::fixed_width_column_wrapper<T>;
+
+TYPED_TEST(groupby_keys_test, structs)
+{
+  using V = TypeParam;
+
+  using R       = cudf::detail::target_type_t<int, aggregation::ARGMAX>;
+  using STRINGS = cudf::test::strings_column_wrapper;
+  using STRUCTS = cudf::test::structs_column_wrapper;
+
+  if (std::is_same_v<V, bool>) return;
+
+  /*
+    `@` indicates null
+       keys:                values:
+       +-----------------+
+       |s1{s2{a,b},    c}|
+       +-----------------+
+     0 |  { { 1, 1}, "a"}|  1
+     1 |  { { 1, 2}, "b"}|  2
+     2 |  {@{ 2, 1}, "c"}|  3
+     3 |  {@{ 2, 1}, "c"}|  4
+     4 | @{ { 2, 2}, "d"}|  5
+     5 | @{ { 2, 2}, "d"}|  6
+     6 |  { { 1, 1}, "a"}|  7
+     7 |  {@{ 2, 1}, "c"}|  8
+     8 |  { {@1, 1}, "a"}|  9
+       +-----------------+
+  */
+
+  // clang-format off
+  auto col_a = FWCW<V>{{ 1,   1,   2,   2,   2,   2,   1,   2,   1 }, null_at(8)};
+  auto col_b = FWCW<V> { 1,   2,   1,   1,   2,   2,   1,   1,   1 };
+  auto col_c = STRINGS {"a", "b", "c", "c", "d", "d", "a", "c", "a"};
+  // clang-format on
+  auto s2 = STRUCTS{{col_a, col_b}, nulls_at({2, 3, 7})};
+
+  auto keys = STRUCTS{{s2, col_c}, nulls_at({4, 5})};
+  auto vals = FWCW<int>{1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+  // clang-format off
+  auto expected_col_a = FWCW<V>{{1,   1,   1,   2 }, null_at(2)};
+  auto expected_col_b = FWCW<V>{ 1,   2,   1,   1 };
+  auto expected_col_c = STRINGS{"a", "b", "a", "c"};
+  // clang-format on
+  auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}, null_at(3)};
+
+  auto expect_keys = STRUCTS{{expected_s2, expected_col_c}, no_nulls()};
+  auto expect_vals = FWCW<R>{6, 1, 8, 7};
+
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
+  EXPECT_THROW(test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)),
+               cudf::logic_error);  // this test only asserts that a logic_error is thrown
+}
+
+template <typename T>
+using LCW = cudf::test::lists_column_wrapper<T, int32_t>;
+
+TYPED_TEST(groupby_keys_test, lists)
+{
+  using R = cudf::detail::target_type_t<int32_t, aggregation::SUM>;
+
+  // clang-format off
+  auto keys   = LCW<TypeParam> { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} };
+  auto values = FWCW<int32_t>  {    0,     1,     2,     3,     4  };
+
+  auto expected_keys   = LCW<TypeParam> { {1,1}, {2,2}, {3,3} };
+  auto expected_values = FWCW<R>        {    3,     5,     2  };
+  // clang-format on
+
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
+  EXPECT_THROW(test_single_agg(keys, values, expected_keys, expected_values, std::move(agg)),
+               cudf::logic_error);  // this test only asserts that a logic_error is thrown
+}
+
 struct groupby_string_keys_test : public cudf::test::BaseFixture {
 };
 
diff --git a/cpp/tests/groupby/lists_tests.cpp b/cpp/tests/groupby/lists_tests.cpp
deleted file mode 100644
index 11b8ffa92b9..00000000000
--- a/cpp/tests/groupby/lists_tests.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <tests/groupby/groupby_test_util.hpp>
-
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
-
-#include <cudf/detail/aggregation/aggregation.hpp>
-
-namespace cudf {
-namespace test {
-
-template <typename V>
-struct groupby_lists_test : public cudf::test::BaseFixture {
-};
-
-TYPED_TEST_SUITE(groupby_lists_test, cudf::test::FixedWidthTypes);
-
-namespace {
-// Checking with a single aggregation, and aggregation column.
-// This test is orthogonal to the aggregation type; it focuses on testing the grouping
-// with LISTS keys.
-auto sum_agg() { return cudf::make_sum_aggregation<groupby_aggregation>(); }
-
-void test_sort_based_sum_agg(column_view const& keys, column_view const& values)
-{
-  test_single_agg(
-    keys, values, keys, values, sum_agg(), force_use_sort_impl::YES, null_policy::INCLUDE);
-}
-
-void test_hash_based_sum_agg(column_view const& keys, column_view const& values)
-{
-  test_single_agg(
-    keys, values, keys, values, sum_agg(), force_use_sort_impl::NO, null_policy::INCLUDE);
-}
-
-}  // namespace
-
-TYPED_TEST(groupby_lists_test, top_level_lists_are_unsupported)
-{
-  // Test that grouping on LISTS columns fails visibly.
-
-  // clang-format off
-  auto keys   = lists_column_wrapper<TypeParam, int32_t> { {1,1},  {2,2},  {3,3},   {1,1},   {2,2} };
-  auto values = fixed_width_column_wrapper<int32_t>      {     0,      1,      2,      3,       4  };
-  // clang-format on
-
-  EXPECT_THROW(test_sort_based_sum_agg(keys, values), cudf::logic_error);
-  EXPECT_THROW(test_hash_based_sum_agg(keys, values), cudf::logic_error);
-}
-
-}  // namespace test
-}  // namespace cudf
diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu
new file mode 100644
index 00000000000..7c145271662
--- /dev/null
+++ b/cpp/tests/groupby/lists_tests.cu
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "groupby_test_util.hpp"
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/concatenate.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/logical.h>
+
+#include <vector>
+
+namespace cudf {
+namespace test {
+
+template <typename V>  // V is exposed to each TYPED_TEST body as `TypeParam`
+struct groupby_lists_test : public cudf::test::BaseFixture {
+};
+// Register the fixture as a typed suite instantiated over `cudf::test::FixedWidthTypes`.
+TYPED_TEST_SUITE(groupby_lists_test, cudf::test::FixedWidthTypes);
+
+using namespace cudf::test::iterators;  // validity helpers: all_nulls(), nulls_at(), null_at()
+
+using R = cudf::detail::target_type_t<int32_t, aggregation::SUM>;  // Type of aggregation result.
+using strings = strings_column_wrapper;                            // Not referenced in this file.
+using structs = structs_column_wrapper;                            // Not referenced in this file.
+
+template <typename T>
+using fwcw = cudf::test::fixed_width_column_wrapper<T>;  // Shorthand for value/result columns.
+
+template <typename T>
+using lcw = cudf::test::lists_column_wrapper<T, int32_t>;  // Shorthand for LIST key columns.
+
+namespace {
+static constexpr auto null = -1;  // Placeholder for entries hidden by a null mask in the tests.
+
+// Builds a fresh SUM `groupby_aggregation` on every call. The tests use a single aggregation
+// over a single values column: they are orthogonal to the aggregation type and focus on
+// grouping with LISTS keys.
+auto sum_agg() { return cudf::make_sum_aggregation<groupby_aggregation>(); }
+
+// Device-side predicate: does row `idx` equal any row in `[_num_rows, 2 * _num_rows)`?
+// TODO: this is a naive way to compare expected key/value against resulting key/value. To be
+// replaced once list lex comparator is supported (https://github.com/rapidsai/cudf/issues/5890)
+template <typename Equal>
+struct match_expected_fn {
+  match_expected_fn(cudf::size_type const num_rows, Equal equal)
+    : _num_rows{num_rows}, _equal{equal}
+  {
+  }
+
+  __device__ bool operator()(cudf::size_type const idx)
+  {
+    auto row = _num_rows;
+    while (row < 2 * _num_rows && !_equal(idx, row)) { ++row; }
+    return row < 2 * _num_rows;
+  }
+
+  cudf::size_type const _num_rows;
+  Equal _equal;
+};
+
+// Runs a SUM groupby on `keys`/`values` (null keys included, keys not pre-sorted) and checks
+// that every expected (key, value) row equals some row of the result, in any order.
+inline void test_hash_based_sum_agg(column_view const& keys,
+                                    column_view const& values,
+                                    column_view const& expect_keys,
+                                    column_view const& expect_vals)
+{
+  auto const include_null_keys = null_policy::INCLUDE;
+  auto const keys_are_sorted   = sorted::NO;
+
+  std::vector<groupby::aggregation_request> requests;
+  auto& request  = requests.emplace_back(groupby::aggregation_request());
+  request.values = values;
+  request.aggregations.push_back(sum_agg());
+
+  groupby::groupby gb_obj(cudf::table_view({keys}), include_null_keys, keys_are_sorted);
+  auto result = gb_obj.aggregate(requests);
+
+  cudf::table_view result_kv{
+    {result.first->get_column(0).view(), result.second[0].results[0]->view()}};
+  cudf::table_view expected_kv{{expect_keys, expect_vals}};
+
+  auto const num_rows = result_kv.num_rows();
+  ASSERT_EQ(num_rows, expected_kv.num_rows());  // fatal: the indexing below needs equal sizes
+
+  // Concatenate expected table and resulting table into one unique table `t`:
+  // expected table:  `t [       0,     num_rows - 1]`
+  // resulting table: `t [num_rows, 2 * num_rows - 1]`
+  auto combined_table = cudf::concatenate(std::vector{expected_kv, result_kv});
+  auto preprocessed_t = cudf::experimental::row::hash::preprocessed_table::create(
+    combined_table->view(), rmm::cuda_stream_default);
+  cudf::experimental::row::equality::self_comparator comparator(preprocessed_t);
+
+  auto const null_keys_are_equal =
+    include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL;
+  auto row_equal = comparator.device_comparator(nullate::DYNAMIC{true}, null_keys_are_equal);
+  auto func      = match_expected_fn{num_rows, row_equal};
+
+  // Each expected row `t[0, num_rows)` must match some result row `t[num_rows, 2 * num_rows)`.
+  EXPECT_TRUE(thrust::all_of(thrust::make_counting_iterator<cudf::size_type>(0),
+                             thrust::make_counting_iterator<cudf::size_type>(num_rows),
+                             func));
+}
+
+// Delegates to `test_single_agg` with a fresh SUM aggregation, requesting the sort-based
+// implementation (`force_use_sort_impl::YES`) and including null keys
+// (`null_policy::INCLUDE`).
+void test_sort_based_sum_agg(column_view const& keys,
+                             column_view const& values,
+                             column_view const& expect_keys,
+                             column_view const& expect_vals)
+{
+  auto const sort_impl = force_use_sort_impl::YES;
+  auto const null_mode = null_policy::INCLUDE;
+
+  test_single_agg(keys, values, expect_keys, expect_vals, sum_agg(), sort_impl, null_mode);
+}
+
+void test_sum_agg(column_view const& keys,
+                  column_view const& values,
+                  column_view const& expected_keys,
+                  column_view const& expected_values)
+{
+  EXPECT_THROW(test_sort_based_sum_agg(keys, values, expected_keys, expected_values),
+               cudf::logic_error);  // sort-forced path must throw; results are checked below
+  test_hash_based_sum_agg(keys, values, expected_keys, expected_values);
+}
+}  // namespace
+
+TYPED_TEST(groupby_lists_test, basic)
+{
+  if (std::is_same_v<TypeParam, bool>) { return; }  // nothing is checked for bool elements
+
+  // clang-format off
+  auto keys   = lcw<TypeParam> { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} };
+  auto values = fwcw<int32_t>  {    0,     1,     2,     3,     4  };
+  // Rows 0/3 share {1,1} (0 + 3 = 3), rows 1/4 share {2,2} (1 + 4 = 5), row 2 is alone (2).
+  auto expected_keys   = lcw<TypeParam> { {1,1}, {2,2}, {3,3} };
+  auto expected_values = fwcw<R>        {    3,     5,     2  };
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_lists_test, all_null_input)
+{
+  // clang-format off
+  auto keys   = lcw<TypeParam> { {{1,1}, {2,2}, {3,3}, {1,1}, {2,2}}, all_nulls()};
+  auto values = fwcw<int32_t>  {     0,     1,     2,     3,     4 };
+  // Every key row is null, so a single null group is expected: 0 + 1 + 2 + 3 + 4 = 10.
+  auto expected_keys   = lcw<TypeParam> { {{null,null}}, all_nulls()};
+  auto expected_values = fwcw<R>        {          10 };
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_lists_test, lists_with_nulls)
+{
+  // clang-format off
+  auto keys   = lcw<TypeParam> { {{1,1}, {2,2}, {3,3}, {1,1}, {2,2}}, nulls_at({1,2,4})};
+  auto values = fwcw<int32_t>  {     0,     1,     2,     3,     4 };
+  // Rows 0/3 share {1,1} (0 + 3 = 3); null rows 1, 2 and 4 form one group (1 + 2 + 4 = 7).
+  auto expected_keys   = lcw<TypeParam> { {{1,1}, {null,null}}, null_at(1)};
+  auto expected_values = fwcw<R>        {     3,           7 };
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_lists_test, lists_with_null_elements)
+{
+  auto keys =
+    lcw<TypeParam>{{lcw<TypeParam>{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})},
+                    lcw<TypeParam>{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})},
+                    lcw<TypeParam>{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})},
+                    lcw<TypeParam>{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})}},
+                   nulls_at({2, 3})};
+  auto values = fwcw<int32_t>{1, 2, 4, 5};
+  // Rows 0/1 hold the same list with null elements (1 + 2 = 3); null rows 2/3 group (4 + 5 = 9).
+  auto expected_keys = lcw<TypeParam>{
+    {lcw<TypeParam>{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})}, {}}, null_at(1)};
+  auto expected_values = fwcw<R>{3, 9};
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+}  // namespace test
+}  // namespace cudf