From 4c272df8f2ee753f26e088bb4d95fff28d731bbf Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 23 Jun 2021 10:31:28 -0600 Subject: [PATCH 01/19] Add aggregation definitions --- cpp/include/cudf/aggregation.hpp | 70 ++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 5fab284d506..995eae7a4a6 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -78,12 +78,14 @@ class aggregation { ROW_NUMBER, ///< get row-number of current index (relative to rolling window) COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries - MERGE_LISTS, ///< merge multiple lists values into one list - MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row PTX, ///< PTX UDF based reduction - CUDA ///< CUDA UDF based reduction + CUDA, ///< CUDA UDF based reduction + MERGE_LISTS, ///< merge multiple lists values into one list + MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries + MERGE_VARIANCE, ///< merge partial variance values + MERGE_STD ///< merge partial standard deviation values }; aggregation() = delete; @@ -271,6 +273,28 @@ std::unique_ptr make_collect_set_aggregation(null_policy null_handling = n null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL); +/// Factory to create a LAG aggregation +template +std::unique_ptr make_lag_aggregation(size_type offset); + +/// Factory to create a LEAD aggregation +template +std::unique_ptr make_lead_aggregation(size_type offset); + +/** + * @brief Factory to create an aggregation base on UDF for PTX or CUDA + * + * @param[in] type: either udf_type::PTX or udf_type::CUDA + * @param[in] user_defined_aggregator A string containing the aggregator code + * @param[in] output_type expected output type + * + * @return aggregation unique pointer housing user_defined_aggregator string. + */ +template +std::unique_ptr make_udf_aggregation(udf_type type, + std::string const& user_defined_aggregator, + data_type output_type); + /** * @brief Factory to create a MERGE_LISTS aggregation. * @@ -308,27 +332,39 @@ template std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL); -/// Factory to create a LAG aggregation -template -std::unique_ptr make_lag_aggregation(size_type offset); - -/// Factory to create a LEAD aggregation +/** + * @brief Factory to create a MERGE_VARIANCE aggregation + * + * This aggregation is designed specificly to perform distributed computing of `VARIANCE` + * aggregation. The partial results input to this aggregation is generated by two groupby + * aggregations: `VARIANCE` and `COUNT_VALID`. + * + * In order to use this aggregation, the `aggregation_request` array input to `groupby::aggregate` + * must contain at least two requests: + * - A `COLLECT_LIST` request for collecting the partial results of `COUNT_VALID`, and + * - This `MERGE_VARIANCE` request, which must be given AFTER the `COLLECT_LIST` request above + * + * Since the partial results output from `VARIANCE` and `COUNT_VALID` do not contain nulls, the + * input values to those two aggregations must be non-nullable. 
+ * + * @param ddof Delta degrees of freedom. The divisor used in calculation of `variance` is + * `N - ddof`, where `N` is the population size. + */ template -std::unique_ptr make_lead_aggregation(size_type offset); +std::unique_ptr make_merge_variance_aggregation(size_type ddof = 1); /** - * @brief Factory to create an aggregation base on UDF for PTX or CUDA + * @brief Factory to create a MERGE_STD aggregation * - * @param[in] type: either udf_type::PTX or udf_type::CUDA - * @param[in] user_defined_aggregator A string containing the aggregator code - * @param[in] output_type expected output type + * This aggregation is designed specificly to perform distributed computing of `STD` + * aggregation. The partial results input to this aggregation and its usage are the same as of + * `MERGE_VARIANCE` aggregation. * - * @return aggregation unique pointer housing user_defined_aggregator string. + * @param ddof Delta degrees of freedom. The divisor used in calculation of `variance` is + * `N - ddof`, where `N` is the population size. */ template -std::unique_ptr make_udf_aggregation(udf_type type, - std::string const& user_defined_aggregator, - data_type output_type); +std::unique_ptr make_merge_std_aggregation(size_type ddof = 1); /** @} */ // end of group } // namespace cudf From d446076ebe63cbbeec7d82ecfa6ce32a568b309c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 24 Jun 2021 15:26:00 -0600 Subject: [PATCH 02/19] Adding new aggregations: M2, MERGE_VARIANCES, MERGE_STDS --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/aggregation.hpp | 90 ++++--- .../cudf/detail/aggregation/aggregation.hpp | 249 ++++++++++++------ cpp/src/aggregation/aggregation.cpp | 147 +++++++---- cpp/src/groupby/sort/aggregate.cpp | 122 ++++++++- cpp/src/groupby/sort/group_merge_variances.cu | 74 ++++++ cpp/src/groupby/sort/group_reductions.hpp | 65 +++++ cpp/src/groupby/sort/group_std.cu | 88 ++++++- 8 files changed, 657 insertions(+), 179 deletions(-) create mode 100644 cpp/src/groupby/sort/group_merge_variances.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 678f202d106..05c3028067e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -199,6 +199,7 @@ add_library(cudf src/groupby/sort/aggregate.cpp src/groupby/sort/group_collect.cu src/groupby/sort/group_merge_lists.cu + src/groupby/sort/group_merge_variances.cu src/groupby/sort/group_count.cu src/groupby/sort/group_max.cu src/groupby/sort/group_min.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 995eae7a4a6..5913d39e020 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -57,35 +57,36 @@ class aggregation { * @brief Possible aggregation operations */ enum Kind { - SUM, ///< sum reduction - PRODUCT, ///< product reduction - MIN, ///< min reduction - MAX, ///< max reduction - COUNT_VALID, ///< count number of valid elements - COUNT_ALL, ///< count number of elements - ANY, ///< any reduction - ALL, ///< all reduction - SUM_OF_SQUARES, ///< sum of squares reduction - MEAN, ///< arithmetic mean reduction - VARIANCE, ///< groupwise variance - STD, ///< groupwise standard deviation - MEDIAN, ///< median reduction - QUANTILE, ///< compute specified quantile(s) - ARGMAX, ///< Index of max element - ARGMIN, ///< Index of min element - NUNIQUE, ///< count number of unique elements - NTH_ELEMENT, ///< get the nth element - ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - COLLECT_LIST, ///< collect values into a list - COLLECT_SET, ///< collect 
values into a list without duplicate entries - LEAD, ///< window function, accesses row at specified offset following current row - LAG, ///< window function, accesses row at specified offset preceding current row - PTX, ///< PTX UDF based reduction - CUDA, ///< CUDA UDF based reduction - MERGE_LISTS, ///< merge multiple lists values into one list - MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_VARIANCE, ///< merge partial variance values - MERGE_STD ///< merge partial standard deviation values + SUM, ///< sum reduction + PRODUCT, ///< product reduction + MIN, ///< min reduction + MAX, ///< max reduction + COUNT_VALID, ///< count number of valid elements + COUNT_ALL, ///< count number of elements + ANY, ///< any reduction + ALL, ///< all reduction + SUM_OF_SQUARES, ///< sum of squares reduction + MEAN, ///< arithmetic mean reduction + M2, ///< groupwise sum of squares of differences from the current mean + VARIANCE, ///< groupwise variance + STD, ///< groupwise standard deviation + MEDIAN, ///< median reduction + QUANTILE, ///< compute specified quantile(s) + ARGMAX, ///< Index of max element + ARGMIN, ///< Index of min element + NUNIQUE, ///< count number of unique elements + NTH_ELEMENT, ///< get the nth element + ROW_NUMBER, ///< get row-number of current index (relative to rolling window) + COLLECT_LIST, ///< collect values into a list + COLLECT_SET, ///< collect values into a list without duplicate entries + LEAD, ///< window function, accesses row at specified offset following current row + LAG, ///< window function, accesses row at specified offset preceding current row + PTX, ///< PTX UDF based reduction + CUDA, ///< CUDA UDF based reduction + MERGE_LISTS, ///< merge multiple lists values into one list + MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries + MERGE_VARIANCES, ///< merge partial variance values + MERGE_STDS ///< merge partial standard deviation values }; aggregation() = delete; @@ -161,6 +162,16 @@ std::unique_ptr make_sum_of_squares_aggregation(); template std::unique_ptr make_mean_aggregation(); +/** + * @brief Factory to create a M2 aggregation + * + * A M2 aggregation is groupwise sum of squares of differences from the current mean. From this, + * a `VARIANCE` aggregation can be computed as `M2 / (N - ddof)`, where `N` is the population size + * and `ddof` is the delta degrees of freedom. + */ +template +std::unique_ptr make_m2_aggregation(); + /** * @brief Factory to create a VARIANCE aggregation * @@ -333,7 +344,7 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu nan_equality nans_equal = nan_equality::UNEQUAL); /** - * @brief Factory to create a MERGE_VARIANCE aggregation + * @brief Factory to create a MERGE_VARIANCES aggregation * * This aggregation is designed specificly to perform distributed computing of `VARIANCE` * aggregation. 
The partial results input to this aggregation is generated by two groupby
  * aggregations: `VARIANCE` and `COUNT_VALID`.
  *
  * In order to use this aggregation, the `aggregation_request` array input to `groupby::aggregate`
  * must contain at least two requests:
- * - A `COLLECT_LIST` request for collecting the partial results of `COUNT_VALID`, and
- * - This `MERGE_VARIANCE` request, which must be given AFTER the `COLLECT_LIST` request above
+ * - A request for `COLLECT_LIST` aggregation to collect the partial results of `COUNT_VALID`
+ * - This `MERGE_VARIANCES` request, which must be given AFTER the request above so that it can
+ *   access the cached results generated by that request
+ *
+ * For a merging operation that is not a final merge (i.e., its outputs will be used as input to
+ * perform another `MERGE_VARIANCES` aggregation), a `SUM` aggregation must also be added to the
+ * same request for `COLLECT_LIST` above to produce the merged values for `COUNT_VALID`.
  *
  * Since the partial results output from `VARIANCE` and `COUNT_VALID` do not contain nulls, the
- * input values to those two aggregations must be non-nullable.
+ * input values columns to those two requests must be non-nullable.
  *
  * @param ddof Delta degrees of freedom. The divisor used in calculation of `variance` is
  * `N - ddof`, where `N` is the population size.
  */
 template <typename Base = aggregation>
-std::unique_ptr<Base> make_merge_variance_aggregation(size_type ddof = 1);
+std::unique_ptr<Base> make_merge_variances_aggregation(size_type ddof = 1);
 
 /**
- * @brief Factory to create a MERGE_STD aggregation
+ * @brief Factory to create a MERGE_STDS aggregation
  *
  * This aggregation is designed specificly to perform distributed computing of `STD`
  * aggregation. The partial results input to this aggregation and its usage are the same as of
- * `MERGE_VARIANCE` aggregation.
+ * `MERGE_VARIANCES` aggregation.
  *
  * @param ddof Delta degrees of freedom. The divisor used in calculation of `variance` is
 * `N - ddof`, where `N` is the population size. 
*/ template -std::unique_ptr make_merge_std_aggregation(size_type ddof = 1); +std::unique_ptr make_merge_stds_aggregation(size_type ddof = 1); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 373d695a5b5..ab2ef947cba 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -53,6 +53,8 @@ class simple_aggregations_collector { // Declares the interface for the simple data_type col_type, class sum_of_squares_aggregation const& agg); virtual std::vector> visit(data_type col_type, class mean_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class m2_aggregation const& agg); virtual std::vector> visit(data_type col_type, class var_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -75,14 +77,18 @@ class simple_aggregations_collector { // Declares the interface for the simple data_type col_type, class collect_list_aggregation const& agg); virtual std::vector> visit(data_type col_type, class collect_set_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class lead_lag_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class udf_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_lists_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_sets_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class merge_variances_aggregation const& agg); virtual std::vector> visit(data_type col_type, - class lead_lag_aggregation const& agg); - virtual std::vector> visit(data_type col_type, - class udf_aggregation const& agg); + class merge_stds_aggregation const& agg); }; class aggregation_finalizer { // Declares the interface for the finalizer @@ -98,6 +104,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class all_aggregation const& agg); virtual void visit(class sum_of_squares_aggregation const& agg); virtual void visit(class mean_aggregation const& agg); + virtual void visit(class m2_aggregation const& agg); virtual void visit(class var_aggregation const& agg); virtual void visit(class std_aggregation const& agg); virtual void visit(class median_aggregation const& agg); @@ -109,10 +116,12 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class row_number_aggregation const& agg); virtual void visit(class collect_list_aggregation const& agg); virtual void visit(class collect_set_aggregation const& agg); - virtual void visit(class merge_lists_aggregation const& agg); - virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); virtual void visit(class udf_aggregation const& agg); + virtual void visit(class merge_lists_aggregation const& agg); + virtual void visit(class merge_sets_aggregation const& agg); + virtual void visit(class merge_variances_aggregation const& agg); + virtual void visit(class merge_stds_aggregation const& agg); }; /** @@ -286,6 +295,25 @@ class mean_aggregation final : public rolling_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived class for specifying a m2 aggregation + */ +class m2_aggregation : public aggregation { + public: + m2_aggregation() : aggregation{M2} {} + + 
std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived class for specifying a standard deviation/variance aggregation */ @@ -305,8 +333,9 @@ class std_var_aggregation : public aggregation { protected: std_var_aggregation(aggregation::Kind k, size_type ddof) : aggregation(k), _ddof{ddof} { - CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, - "std_var_aggregation can accept only STD, VARIANCE"); + CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE or + k == aggregation::MERGE_STDS or k == aggregation::MERGE_VARIANCES, + "std_var_aggregation can accept only STD, VARIANCE, MERGE_STDS, MERGE_VARIANCES"); } size_type hash_impl() const { return std::hash{}(_ddof); } @@ -634,7 +663,97 @@ class collect_set_aggregation final : public rolling_aggregation { }; /** - * @brief Derived aggregation class for specifying MERGE_LISTs aggregation + * @brief Derived aggregation class for specifying LEAD/LAG window aggregations + */ +class lead_lag_aggregation final : public rolling_aggregation { + public: + lead_lag_aggregation(Kind kind, size_type offset) + : aggregation{offset < 0 ? (kind == LAG ? LEAD : LAG) : kind}, row_offset{std::abs(offset)} + { + } + + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (row_offset == other.row_offset); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + size_type row_offset; + + private: + size_t hash_impl() const { return std::hash()(row_offset); } +}; + +/** + * @brief Derived class for specifying a custom aggregation + * specified in udf + */ +class udf_aggregation final : public rolling_aggregation { + public: + udf_aggregation(aggregation::Kind type, + std::string const& user_defined_aggregator, + data_type output_type) + : aggregation{type}, + _source{user_defined_aggregator}, + _operator_name{(type == aggregation::PTX) ? 
"rolling_udf_ptx" : "rolling_udf_cuda"}, + _function_name{"rolling_udf"}, + _output_type{output_type} + { + CUDF_EXPECTS(type == aggregation::PTX or type == aggregation::CUDA, + "udf_aggregation can accept only PTX, CUDA"); + } + + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_source == other._source and _operator_name == other._operator_name and + _function_name == other._function_name and _output_type == other._output_type); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + std::string const _source; + std::string const _operator_name; + std::string const _function_name; + data_type _output_type; + + protected: + size_t hash_impl() const + { + return std::hash{}(_source) ^ std::hash{}(_operator_name) ^ + std::hash{}(_function_name) ^ + std::hash{}(static_cast(_output_type.id())); + } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_LISTS aggregation */ class merge_lists_aggregation final : public aggregation { public: @@ -653,7 +772,7 @@ class merge_lists_aggregation final : public aggregation { }; /** - * @brief Derived aggregation class for specifying MERGE_SETs aggregation + * @brief Derived aggregation class for specifying MERGE_SETS aggregation */ class merge_sets_aggregation final : public aggregation { public: @@ -694,27 +813,18 @@ class merge_sets_aggregation final : public aggregation { }; /** - * @brief Derived aggregation class for specifying LEAD/LAG window aggregations + * @brief Derived aggregation class for specifying MERGE_VARIANCES aggregation */ -class lead_lag_aggregation final : public rolling_aggregation { +class merge_variances_aggregation final : public std_var_aggregation { public: - lead_lag_aggregation(Kind kind, size_type offset) - : aggregation{offset < 0 ? (kind == LAG ? 
LEAD : LAG) : kind}, row_offset{std::abs(offset)} + explicit merge_variances_aggregation(size_type ddof) + : std_var_aggregation{aggregation::MERGE_VARIANCES, ddof} { } - bool is_equal(aggregation const& _other) const override - { - if (!this->aggregation::is_equal(_other)) { return false; } - auto const& other = dynamic_cast(_other); - return (row_offset == other.row_offset); - } - - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } - std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); } std::vector> get_simple_aggregations( data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override @@ -722,45 +832,21 @@ class lead_lag_aggregation final : public rolling_aggregation { return collector.visit(col_type, *this); } void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } - - size_type row_offset; - - private: - size_t hash_impl() const { return std::hash()(row_offset); } }; /** - * @brief Derived class for specifying a custom aggregation - * specified in udf + * @brief Derived aggregation class for specifying MERGE_STDS aggregation */ -class udf_aggregation final : public rolling_aggregation { +class merge_stds_aggregation final : public std_var_aggregation { public: - udf_aggregation(aggregation::Kind type, - std::string const& user_defined_aggregator, - data_type output_type) - : aggregation{type}, - _source{user_defined_aggregator}, - _operator_name{(type == aggregation::PTX) ? "rolling_udf_ptx" : "rolling_udf_cuda"}, - _function_name{"rolling_udf"}, - _output_type{output_type} + explicit merge_stds_aggregation(size_type ddof) + : std_var_aggregation{aggregation::MERGE_STDS, ddof} { - CUDF_EXPECTS(type == aggregation::PTX or type == aggregation::CUDA, - "udf_aggregation can accept only PTX, CUDA"); - } - - bool is_equal(aggregation const& _other) const override - { - if (!this->aggregation::is_equal(_other)) { return false; } - auto const& other = dynamic_cast(_other); - return (_source == other._source and _operator_name == other._operator_name and - _function_name == other._function_name and _output_type == other._output_type); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } - std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); } std::vector> get_simple_aggregations( data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override @@ -768,19 +854,6 @@ class udf_aggregation final : public rolling_aggregation { return collector.visit(col_type, *this); } void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } - - std::string const _source; - std::string const _operator_name; - std::string const _function_name; - data_type _output_type; - - protected: - size_t hash_impl() const - { - return std::hash{}(_source) ^ std::hash{}(_operator_name) ^ - std::hash{}(_function_name) ^ - std::hash{}(static_cast(_output_type.id())); - } }; /** @@ -904,6 +977,12 @@ struct target_type_impl() && is_su using type = Source; }; +// Always use `double` for M2 +template +struct target_type_impl { + using type = double; +}; + // Always use `double` for VARIANCE template struct target_type_impl { @@ -970,6 +1049,18 @@ struct target_type_impl { using type = cudf::list_view; }; +// Always use Source for LEAD +template +struct target_type_impl { + using type = Source; +}; + +// Always use Source for LAG 
+template +struct target_type_impl { + using type = Source; +}; + // Always use list for MERGE_LISTS template struct target_type_impl { @@ -982,16 +1073,16 @@ struct target_type_impl { using type = cudf::list_view; }; -// Always use Source for LEAD -template -struct target_type_impl { - using type = Source; +// Always use `double` for MERGE_VARIANCES +template +struct target_type_impl { + using type = double; }; -// Always use Source for LAG -template -struct target_type_impl { - using type = Source; +// Always use `double` for MERGE_STDS +template +struct target_type_impl { + using type = double; }; /** @@ -1083,14 +1174,18 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: return f.template operator()(std::forward(args)...); - case aggregation::MERGE_LISTS: - return f.template operator()(std::forward(args)...); - case aggregation::MERGE_SETS: - return f.template operator()(std::forward(args)...); case aggregation::LEAD: return f.template operator()(std::forward(args)...); case aggregation::LAG: return f.template operator()(std::forward(args)...); + case aggregation::MERGE_LISTS: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_SETS: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_VARIANCES: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_STDS: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index f0fd865f685..26340bf6c41 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -88,6 +88,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, m2_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, var_aggregation const& agg) { @@ -154,6 +160,18 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, lead_lag_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, udf_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, merge_lists_aggregation const& agg) { @@ -167,13 +185,13 @@ std::vector> simple_aggregations_collector::visit( } std::vector> simple_aggregations_collector::visit( - data_type col_type, lead_lag_aggregation const& agg) + data_type col_type, merge_variances_aggregation const& agg) { return visit(col_type, static_cast(agg)); } std::vector> simple_aggregations_collector::visit( - data_type col_type, udf_aggregation const& agg) + data_type col_type, merge_stds_aggregation const& agg) { return visit(col_type, static_cast(agg)); } @@ -227,6 +245,11 @@ void aggregation_finalizer::visit(mean_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(m2_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(var_aggregation const& agg) { visit(static_cast(agg)); @@ -282,6 +305,16 @@ void 
aggregation_finalizer::visit(collect_set_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(lead_lag_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(udf_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(merge_lists_aggregation const& agg) { visit(static_cast(agg)); @@ -292,12 +325,12 @@ void aggregation_finalizer::visit(merge_sets_aggregation const& agg) visit(static_cast(agg)); } -void aggregation_finalizer::visit(lead_lag_aggregation const& agg) +void aggregation_finalizer::visit(merge_variances_aggregation const& agg) { visit(static_cast(agg)); } -void aggregation_finalizer::visit(udf_aggregation const& agg) +void aggregation_finalizer::visit(merge_stds_aggregation const& agg) { visit(static_cast(agg)); } @@ -311,7 +344,7 @@ std::vector> aggregation::get_simple_aggregations( } /// Factory to create a SUM aggregation -template +template std::unique_ptr make_sum_aggregation() { return std::make_unique(); @@ -320,7 +353,7 @@ template std::unique_ptr make_sum_aggregation(); template std::unique_ptr make_sum_aggregation(); /// Factory to create a PRODUCT aggregation -template +template std::unique_ptr make_product_aggregation() { return std::make_unique(); @@ -328,7 +361,7 @@ std::unique_ptr make_product_aggregation() template std::unique_ptr make_product_aggregation(); /// Factory to create a MIN aggregation -template +template std::unique_ptr make_min_aggregation() { return std::make_unique(); @@ -337,7 +370,7 @@ template std::unique_ptr make_min_aggregation(); template std::unique_ptr make_min_aggregation(); /// Factory to create a MAX aggregation -template +template std::unique_ptr make_max_aggregation() { return std::make_unique(); @@ -346,7 +379,7 @@ template std::unique_ptr make_max_aggregation(); template std::unique_ptr make_max_aggregation(); /// Factory to create a COUNT aggregation -template +template std::unique_ptr make_count_aggregation(null_policy null_handling) { auto kind = @@ -359,7 +392,7 @@ template std::unique_ptr make_count_aggregation +template std::unique_ptr make_any_aggregation() { return std::make_unique(); @@ -367,7 +400,7 @@ std::unique_ptr make_any_aggregation() template std::unique_ptr make_any_aggregation(); /// Factory to create a ALL aggregation -template +template std::unique_ptr make_all_aggregation() { return std::make_unique(); @@ -375,7 +408,7 @@ std::unique_ptr make_all_aggregation() template std::unique_ptr make_all_aggregation(); /// Factory to create a SUM_OF_SQUARES aggregation -template +template std::unique_ptr make_sum_of_squares_aggregation() { return std::make_unique(); @@ -383,7 +416,7 @@ std::unique_ptr make_sum_of_squares_aggregation() template std::unique_ptr make_sum_of_squares_aggregation(); /// Factory to create a MEAN aggregation -template +template std::unique_ptr make_mean_aggregation() { return std::make_unique(); @@ -391,8 +424,16 @@ std::unique_ptr make_mean_aggregation() template std::unique_ptr make_mean_aggregation(); template std::unique_ptr make_mean_aggregation(); +/// Factory to create a M2 aggregation +template +std::unique_ptr make_m2_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_m2_aggregation(); + /// Factory to create a VARIANCE aggregation -template +template std::unique_ptr make_variance_aggregation(size_type ddof) { return std::make_unique(ddof); @@ -400,7 +441,7 @@ std::unique_ptr make_variance_aggregation(size_type ddof) template std::unique_ptr 
make_variance_aggregation(size_type ddof); /// Factory to create a STD aggregation -template +template std::unique_ptr make_std_aggregation(size_type ddof) { return std::make_unique(ddof); @@ -408,7 +449,7 @@ std::unique_ptr make_std_aggregation(size_type ddof) template std::unique_ptr make_std_aggregation(size_type ddof); /// Factory to create a MEDIAN aggregation -template +template std::unique_ptr make_median_aggregation() { return std::make_unique(); @@ -416,7 +457,7 @@ std::unique_ptr make_median_aggregation() template std::unique_ptr make_median_aggregation(); /// Factory to create a QUANTILE aggregation -template +template std::unique_ptr make_quantile_aggregation(std::vector const& q, interpolation i) { return std::make_unique(q, i); @@ -425,7 +466,7 @@ template std::unique_ptr make_quantile_aggregation( std::vector const& q, interpolation i); /// Factory to create an ARGMAX aggregation -template +template std::unique_ptr make_argmax_aggregation() { return std::make_unique(); @@ -434,7 +475,7 @@ template std::unique_ptr make_argmax_aggregation(); template std::unique_ptr make_argmax_aggregation(); /// Factory to create an ARGMIN aggregation -template +template std::unique_ptr make_argmin_aggregation() { return std::make_unique(); @@ -443,7 +484,7 @@ template std::unique_ptr make_argmin_aggregation(); template std::unique_ptr make_argmin_aggregation(); /// Factory to create an NUNIQUE aggregation -template +template std::unique_ptr make_nunique_aggregation(null_policy null_handling) { return std::make_unique(null_handling); @@ -452,7 +493,7 @@ template std::unique_ptr make_nunique_aggregation( null_policy null_handling); /// Factory to create an NTH_ELEMENT aggregation -template +template std::unique_ptr make_nth_element_aggregation(size_type n, null_policy null_handling) { return std::make_unique(n, null_handling); @@ -461,7 +502,7 @@ template std::unique_ptr make_nth_element_aggregation( size_type n, null_policy null_handling); /// Factory to create a ROW_NUMBER aggregation -template +template std::unique_ptr make_row_number_aggregation() { return std::make_unique(); @@ -470,7 +511,7 @@ template std::unique_ptr make_row_number_aggregation() template std::unique_ptr make_row_number_aggregation(); /// Factory to create a COLLECT_LIST aggregation -template +template std::unique_ptr make_collect_list_aggregation(null_policy null_handling) { return std::make_unique(null_handling); @@ -481,7 +522,7 @@ template std::unique_ptr make_collect_list_aggregation +template std::unique_ptr make_collect_set_aggregation(null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal) @@ -493,26 +534,8 @@ template std::unique_ptr make_collect_set_aggregation( template std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); -/// Factory to create a MERGE_LISTS aggregation -template -std::unique_ptr make_merge_lists_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_merge_lists_aggregation(); - -/// Factory to create a MERGE_SETS aggregation -template -std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, - nan_equality nans_equal) -{ - return std::make_unique(nulls_equal, nans_equal); -} -template std::unique_ptr make_merge_sets_aggregation(null_equality, - nan_equality); - /// Factory to create a LAG aggregation -template +template std::unique_ptr make_lag_aggregation(size_type offset) { return std::make_unique(aggregation::LAG, offset); @@ -522,7 +545,7 @@ template 
std::unique_ptr make_lag_aggregation +template std::unique_ptr make_lead_aggregation(size_type offset) { return std::make_unique(aggregation::LEAD, offset); @@ -532,7 +555,7 @@ template std::unique_ptr make_lead_aggregation +template std::unique_ptr make_udf_aggregation(udf_type type, std::string const& user_defined_aggregator, data_type output_type) @@ -548,6 +571,40 @@ template std::unique_ptr make_udf_aggregation( template std::unique_ptr make_udf_aggregation( udf_type type, std::string const& user_defined_aggregator, data_type output_type); +/// Factory to create a MERGE_LISTS aggregation +template +std::unique_ptr make_merge_lists_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_merge_lists_aggregation(); + +/// Factory to create a MERGE_SETS aggregation +template +std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, + nan_equality nans_equal) +{ + return std::make_unique(nulls_equal, nans_equal); +} +template std::unique_ptr make_merge_sets_aggregation(null_equality, + nan_equality); + +/// Factory to create a MERGE_VARIANCES aggregation +template +std::unique_ptr make_merge_variances_aggregation(size_type ddof) +{ + return std::make_unique(ddof); +} +template std::unique_ptr make_merge_variances_aggregation(size_type); + +/// Factory to create a MERGE__STD aggregation +template +std::unique_ptr make_merge_stds_aggregation(size_type ddof) +{ + return std::make_unique(ddof); +} +template std::unique_ptr make_merge_stds_aggregation(size_type); + namespace detail { namespace { struct target_type_functor { diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 5e202b9ef3f..473062dc266 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -239,25 +239,46 @@ void aggregate_result_functor::operator()(aggregation const& }; template <> -void aggregate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; - auto var_agg = dynamic_cast(agg); - auto mean_agg = make_mean_aggregation(); - auto count_agg = make_count_aggregation(); + auto mean_agg = make_mean_aggregation(); operator()(*mean_agg); + auto const mean_result = cache.get_result(col_idx, *mean_agg); + + cache.add_result( + col_idx, + agg, + detail::group_m2(get_grouped_values(), mean_result, helper.group_labels(stream), stream, mr)); +}; + +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + auto const count_agg = make_count_aggregation(); operator()(*count_agg); - column_view mean_result = cache.get_result(col_idx, *mean_agg); - column_view group_sizes = cache.get_result(col_idx, *count_agg); - auto result = detail::group_var(get_grouped_values(), - mean_result, - group_sizes, - helper.group_labels(stream), - var_agg._ddof, - stream, - mr); + auto const group_sizes = cache.get_result(col_idx, *count_agg); + auto const& group_labels = helper.group_labels(stream); + auto const ddof = dynamic_cast(agg)._ddof; + + auto result = [&] { + if (auto m2_agg = make_m2_aggregation(); cache.has_result(col_idx, *m2_agg)) { + auto const group_m2 = cache.get_result(col_idx, *m2_agg); + return detail::group_var_from_m2(group_m2, group_sizes, group_labels, ddof, stream, mr); + } else { + auto mean_agg = make_mean_aggregation(); + operator()(*mean_agg); + auto const mean_result = cache.get_result(col_idx, *mean_agg); + + return detail::group_var( + 
get_grouped_values(), mean_result, group_sizes, group_labels, ddof, stream, mr); + } + }(); + cache.add_result(col_idx, agg, std::move(result)); }; @@ -474,6 +495,81 @@ void aggregate_result_functor::operator()(aggregation c mr)); }; +/** + * @brief Perform merging for the lists that correspond to the same key value. + * + * This aggregation is similar to `COLLECT_LIST` with the following differences: + * - It requires the input values to be a non-nullable lists column, and + * - The values (lists) corresponding to the same key will not result in a list of lists as output + * from `COLLECT_LIST`. Instead, those lists will result in a list generated by merging them + * together. + * + * In practice, this aggregation is used to merge the partial results of multiple (distributed) + * groupby `COLLECT_LIST` aggregations into a final `COLLECT_LIST` result. Those distributed + * aggregations were executed on different values columns partitioned from the original values + * column, then their results were (vertically) concatenated before given as the values column for + * this aggregation. + * + * TODO + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + cache.add_result( + col_idx, + agg, + detail::group_merge_lists( + get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); +}; + +/** + * @brief Perform merging for the lists corresponding to the same key value, then dropping duplicate + * list entries. + * + * This aggregation is similar to `COLLECT_SET` with the following differences: + * - It requires the input values to be a non-nullable lists column, and + * - The values (lists) corresponding to the same key will result in a list generated by merging + * them together then dropping duplicate entries. + * + * In practice, this aggregation is used to merge the partial results of multiple (distributed) + * groupby `COLLECT_LIST` or `COLLECT_SET` aggregations into a final `COLLECT_SET` result. Those + * distributed aggregations were executed on different values columns partitioned from the original + * values column, then their results were (vertically) concatenated before given as the values + * column for this aggregation. + * + * Firstly, this aggregation performs `MERGE_LISTS` to concatenate the input lists (corresponding to + * the same key) into intermediate lists, then it calls `lists::drop_list_duplicates` on them to + * remove duplicate list entries. As such, the input (partial results) to this aggregation should be + * generated by (distributed) `COLLECT_LIST` aggregations, not `COLLECT_SET`, to avoid unnecessarily + * removing duplicate entries for the partial results. + * + * Since duplicate list entries will be removed, the parameters `null_equality` and `nan_equality` + * are needed for calling to `lists::drop_list_duplicates`. 
+ * + * TODO + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + auto const merged_result = detail::group_merge_lists(get_grouped_values(), + helper.group_offsets(stream), + helper.num_groups(stream), + stream, + rmm::mr::get_current_device_resource()); + auto const merge_sets_agg = dynamic_cast(agg); + cache.add_result(col_idx, + agg, + lists::detail::drop_list_duplicates(lists_column_view(merged_result->view()), + merge_sets_agg._nulls_equal, + merge_sets_agg._nans_equal, + stream, + mr)); +}; + } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_merge_variances.cu b/cpp/src/groupby/sort/group_merge_variances.cu new file mode 100644 index 00000000000..329f22431b1 --- /dev/null +++ b/cpp/src/groupby/sort/group_merge_variances.cu @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr group_merge_variances(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(values.type().id() == type_id::LIST, + "Input to `group_merge_lists` must be a lists column."); + CUDF_EXPECTS(!values.nullable(), + "Input to `group_merge_lists` must be a non-nullable lists column."); + + auto offsets_column = make_numeric_column( + data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); + + // Generate offsets of the output lists column by gathering from the provided group offsets and + // the input list offsets. + // + // For example: + // values = [[2, 1], [], [4, -1, -2], [], [, 4, ]] + // list_offsets = [0, 2, 2, 5, 5 8] + // group_offsets = [0, 3, 5] + // + // then, the output offsets_column is [0, 5, 8]. + // + thrust::gather(rmm::exec_policy(stream), + group_offsets.begin(), + group_offsets.end(), + lists_column_view(values).offsets_begin(), + offsets_column->mutable_view().template begin()); + + // The child column of the output lists column is just copied from the input column. 
+ auto child_column = + std::make_unique(lists_column_view(values).get_sliced_child(stream), stream, mr); + + return make_lists_column(num_groups, + std::move(offsets_column), + std::move(child_column), + 0, + rmm::device_buffer{}, + stream, + mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 3390af29330..78492f0dd9b 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -217,6 +217,47 @@ std::unique_ptr group_count_all(cudf::device_span group rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to calculate groupwise sum of squares of differences from the current mean + * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_means = [2.333333, -1.5, 4.0, ] + * + * group_m2 = [4.666666, 1.0, 0, ] + * @endcode + * + * @param values Grouped values to get M2 of + * @param group_means Pre-calculated groupwise MEAN + * @param group_labels ID of group corresponding value in @p values belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr group_m2(column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief group_var + * @param values + * @param group_means + * @param group_sizes + * @param group_labels + * @param ddof + * @param stream + * @param mr + * @return + */ +std::unique_ptr group_var_from_m2(column_view const& group_m2, + column_view const& group_sizes, + cudf::device_span group_labels, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Internal API to calculate groupwise variance * @@ -392,6 +433,30 @@ std::unique_ptr group_merge_lists(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +// todo +/** + * @brief Internal API to merge grouped variances into one variance value. + * + * @code{.pseudo} + * values = [[2, 1], [], [4, -1, -2], [], [, 4, ]] + * group_offsets = [0, 3, 5] + * num_groups = 2 + * + * group_merge_lists(...) = [[2, 1, 4, -1, -2], [, 4, ]] + * @endcode + * + * @param values Grouped values (lists column) to collect. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param num_groups Number of groups. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +std::unique_ptr group_merge_variances(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** @endinternal * */ diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 9ebb516ee14..c63db6c0c4e 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -49,16 +49,22 @@ struct var_transform { { if (d_values.is_null(i)) return 0.0; - ResultType x = static_cast(values_iter[i]); + auto const x = static_cast(values_iter[i]); + auto const group_idx = d_group_labels[i]; - size_type group_idx = d_group_labels[i]; - size_type group_size = d_group_sizes[group_idx]; + if (d_group_sizes) { // for variance/std + auto const group_size = d_group_sizes[group_idx]; - // prevent divide by zero error - if (group_size == 0 or group_size - ddof <= 0) return 0.0; + // prevent divide by zero error + if (group_size == 0 or group_size - ddof <= 0) return 0.0; - ResultType mean = d_means[group_idx]; - return (x - mean) * (x - mean) / (group_size - ddof); + auto const mean = d_means[group_idx]; + return (x - mean) * (x - mean) / (group_size - ddof); + } else { // for m2 + auto const mean = d_means[group_idx]; + auto const diff = x - mean; + return diff * diff; + } } }; @@ -85,6 +91,50 @@ void reduce_by_key_fn(column_device_view const& values, d_result); } +struct m2_functor { + template + std::enable_if_t::value, std::unique_ptr> operator()( + column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { +// Running this in debug build causes a runtime error: +// `reduce_by_key failed on 2nd step: invalid device function` +#if !defined(__CUDACC_DEBUG__) + using ResultType = cudf::detail::target_type_t; + + auto result = make_numeric_column( + data_type(type_to_id()), values.size(), mask_state::UNINITIALIZED, stream, mr); + + auto const values_dv_ptr = column_device_view::create(values, stream); + auto const values_dv = *values_dv_ptr; + + auto d_means = group_means.data(); + auto d_result = result->mutable_view().data(); + + if (!cudf::is_dictionary(values.type())) { + auto const values_iter = values_dv.begin(); + reduce_by_key_fn(values_dv, values_iter, group_labels, d_means, nullptr, 0, d_result, stream); + } else { + auto values_iter = cudf::dictionary::detail::make_dictionary_iterator(*values_dv_ptr); + reduce_by_key_fn(values_dv, values_iter, group_labels, d_means, nullptr, 0, d_result, stream); + } + + return result; +#else + CUDF_FAIL("Groupby m2 aggregation is not supported in debug build"); +#endif + } + + template + std::enable_if_t::value, std::unique_ptr> operator()(Args&&...) + { + CUDF_FAIL("Only numeric types are supported in groupby m2 aggregation"); + } +}; + struct var_functor { template std::enable_if_t::value, std::unique_ptr> operator()( @@ -152,6 +202,30 @@ struct var_functor { } // namespace +std::unique_ptr group_m2(column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto values_type = cudf::is_dictionary(values.type()) + ? 
dictionary_column_view(values).keys().type() + : values.type(); + + return type_dispatcher(values_type, m2_functor{}, values, group_means, group_labels, stream, mr); +} + +std::unique_ptr group_var_from_m2(column_view const& group_m2, + column_view const& group_sizes, + cudf::device_span group_labels, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // TODO + return std::make_unique(); +} + std::unique_ptr group_var(column_view const& values, column_view const& group_means, column_view const& group_sizes, From ebc718a2a82bba9a6d5879d72ff1f011171daba4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 25 Jun 2021 13:33:38 -0600 Subject: [PATCH 03/19] Remove `MERGE_VARIANCES` and `MERGE_STDS`, add `MERGE_M2` --- cpp/CMakeLists.txt | 4 +- cpp/include/cudf/aggregation.hpp | 101 +++++++----------- .../cudf/detail/aggregation/aggregation.hpp | 61 +++-------- cpp/src/aggregation/aggregation.cpp | 31 ++---- cpp/src/groupby/sort/aggregate.cpp | 70 ++---------- ...p_merge_variances.cu => group_merge_m2.cu} | 10 +- cpp/src/groupby/sort/group_reductions.hpp | 12 +-- 7 files changed, 76 insertions(+), 213 deletions(-) rename cpp/src/groupby/sort/{group_merge_variances.cu => group_merge_m2.cu} (86%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 05c3028067e..c6d56f7075b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -199,7 +199,7 @@ add_library(cudf src/groupby/sort/aggregate.cpp src/groupby/sort/group_collect.cu src/groupby/sort/group_merge_lists.cu - src/groupby/sort/group_merge_variances.cu + src/groupby/sort/group_merge_m2.cu src/groupby/sort/group_count.cu src/groupby/sort/group_max.cu src/groupby/sort/group_min.cu @@ -273,7 +273,7 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu - src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 5913d39e020..8602b7086f1 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -57,36 +57,35 @@ class aggregation { * @brief Possible aggregation operations */ enum Kind { - SUM, ///< sum reduction - PRODUCT, ///< product reduction - MIN, ///< min reduction - MAX, ///< max reduction - COUNT_VALID, ///< count number of valid elements - COUNT_ALL, ///< count number of elements - ANY, ///< any reduction - ALL, ///< all reduction - SUM_OF_SQUARES, ///< sum of squares reduction - MEAN, ///< arithmetic mean reduction - M2, ///< groupwise sum of squares of differences from the current mean - VARIANCE, ///< groupwise variance - STD, ///< groupwise standard deviation - MEDIAN, ///< median reduction - QUANTILE, ///< compute specified quantile(s) - ARGMAX, ///< Index of max element - ARGMIN, ///< Index of min element - NUNIQUE, ///< count number of unique elements - NTH_ELEMENT, ///< get the nth element - ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - COLLECT_LIST, ///< collect values into a list - COLLECT_SET, ///< collect values into a list without duplicate entries - LEAD, ///< window function, accesses row at specified offset following current row - LAG, ///< window function, accesses row at specified offset preceding current row - PTX, ///< PTX UDF based reduction - CUDA, ///< CUDA UDF based reduction - MERGE_LISTS, ///< merge multiple lists values into one list - MERGE_SETS, 
///< merge multiple lists values into one list then drop duplicate entries - MERGE_VARIANCES, ///< merge partial variance values - MERGE_STDS ///< merge partial standard deviation values + SUM, ///< sum reduction + PRODUCT, ///< product reduction + MIN, ///< min reduction + MAX, ///< max reduction + COUNT_VALID, ///< count number of valid elements + COUNT_ALL, ///< count number of elements + ANY, ///< any reduction + ALL, ///< all reduction + SUM_OF_SQUARES, ///< sum of squares reduction + MEAN, ///< arithmetic mean reduction + M2, ///< groupwise sum of squares of differences from the group mean + VARIANCE, ///< groupwise variance + STD, ///< groupwise standard deviation + MEDIAN, ///< median reduction + QUANTILE, ///< compute specified quantile(s) + ARGMAX, ///< Index of max element + ARGMIN, ///< Index of min element + NUNIQUE, ///< count number of unique elements + NTH_ELEMENT, ///< get the nth element + ROW_NUMBER, ///< get row-number of current index (relative to rolling window) + COLLECT_LIST, ///< collect values into a list + COLLECT_SET, ///< collect values into a list without duplicate entries + LEAD, ///< window function, accesses row at specified offset following current row + LAG, ///< window function, accesses row at specified offset preceding current row + PTX, ///< PTX UDF based reduction + CUDA, ///< CUDA UDF based reduction + MERGE_LISTS, ///< merge multiple lists values into one list + MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries + MERGE_M2 ///< merge partial values of M2 aggregation }; aggregation() = delete; @@ -165,7 +164,7 @@ std::unique_ptr make_mean_aggregation(); /** * @brief Factory to create a M2 aggregation * - * A M2 aggregation is groupwise sum of squares of differences from the current mean. From this, + * A M2 aggregation is groupwise sum of squares of differences from the group mean. From this, * a `VARIANCE` aggregation can be computed as `M2 / (N - ddof)`, where `N` is the population size * and `ddof` is the delta degrees of freedom. */ @@ -344,43 +343,17 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu nan_equality nans_equal = nan_equality::UNEQUAL); /** - * @brief Factory to create a MERGE_VARIANCES aggregation + * @brief Factory to create a MERGE_M2 aggregation * - * This aggregation is designed specificly to perform distributed computing of `VARIANCE` - * aggregation. The partial results input to this aggregation is generated by two groupby - * aggregations: `VARIANCE` and `COUNT_VALID`. + * This aggregation is designed specificly to perform distributed computing of `M2` + * aggregation. The partial results input to this aggregation is a structs column with children are + * columns generated by three groupby aggregations: `SUM`, `COUNT_VALID`, and `M2`. * - * In order to use this aggregation, the `aggregation_request` array input to `groupby::aggregate` - * must contain at least two requests: - * - A request for `COLLECT_LIST` aggregation to collect the partial results of `COUNT_VALID` - * - This `MERGE_VARIANCES` request, which must be given AFTER the request above so that it can - * access the cached results generated by that request - * - * For a merging operation that is not a final merge (i.e., its outputs will be used as input to - * perform another `MERGE_VARIANCES` aggregation), a `SUM` aggregation must also be added to the - * same request for `COLLECT_LIST` above to produce the merged values for `COUNT_VALID`. 
- * - * Since the partial results output from `VARIANCE` and `COUNT_VALID` do not contain nulls, the - * input values columns to those two requests must be non-nullable. - * - * @param ddof Delta degrees of freedom. The divisor used in calculation of `variance` is - * `N - ddof`, where `N` is the population size. - */ -template -std::unique_ptr make_merge_variances_aggregation(size_type ddof = 1); - -/** - * @brief Factory to create a MERGE_STDS aggregation - * - * This aggregation is designed specificly to perform distributed computing of `STD` - * aggregation. The partial results input to this aggregation and its usage are the same as of - * `MERGE_VARIANCES` aggregation. - * - * @param ddof Delta degrees of freedom. The divisor used in calculation of `variance` is - * `N - ddof`, where `N` is the population size. + * Since the partial results output from those aggregations do not contain nulls, the input structs + * column and its children must all be non-nullable. */ template -std::unique_ptr make_merge_stds_aggregation(size_type ddof = 1); +std::unique_ptr make_merge_m2_aggregation(); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index ab2ef947cba..d4176e1ab6c 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -85,10 +85,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_lists_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_sets_aggregation const& agg); - virtual std::vector> visit( - data_type col_type, class merge_variances_aggregation const& agg); virtual std::vector> visit(data_type col_type, - class merge_stds_aggregation const& agg); + class merge_m2_aggregation const& agg); }; class aggregation_finalizer { // Declares the interface for the finalizer @@ -120,8 +118,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class udf_aggregation const& agg); virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); - virtual void visit(class merge_variances_aggregation const& agg); - virtual void visit(class merge_stds_aggregation const& agg); + virtual void visit(class merge_m2_aggregation const& agg); }; /** @@ -333,9 +330,8 @@ class std_var_aggregation : public aggregation { protected: std_var_aggregation(aggregation::Kind k, size_type ddof) : aggregation(k), _ddof{ddof} { - CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE or - k == aggregation::MERGE_STDS or k == aggregation::MERGE_VARIANCES, - "std_var_aggregation can accept only STD, VARIANCE, MERGE_STDS, MERGE_VARIANCES"); + CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, + "std_var_aggregation can accept only STD, VARIANCE"); } size_type hash_impl() const { return std::hash{}(_ddof); } @@ -813,40 +809,15 @@ class merge_sets_aggregation final : public aggregation { }; /** - * @brief Derived aggregation class for specifying MERGE_VARIANCES aggregation - */ -class merge_variances_aggregation final : public std_var_aggregation { - public: - explicit merge_variances_aggregation(size_type ddof) - : std_var_aggregation{aggregation::MERGE_VARIANCES, ddof} - { - } - - std::unique_ptr clone() const override - { - return std::make_unique(*this); - } - std::vector> get_simple_aggregations( - data_type col_type, 
cudf::detail::simple_aggregations_collector& collector) const override - { - return collector.visit(col_type, *this); - } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } -}; - -/** - * @brief Derived aggregation class for specifying MERGE_STDS aggregation + * @brief Derived aggregation class for specifying MERGE_M2 aggregation */ -class merge_stds_aggregation final : public std_var_aggregation { +class merge_m2_aggregation final : public aggregation { public: - explicit merge_stds_aggregation(size_type ddof) - : std_var_aggregation{aggregation::MERGE_STDS, ddof} - { - } + explicit merge_m2_aggregation() : aggregation{MERGE_M2} {} std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); } std::vector> get_simple_aggregations( data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override @@ -1073,15 +1044,9 @@ struct target_type_impl { using type = cudf::list_view; }; -// Always use `double` for MERGE_VARIANCES -template -struct target_type_impl { - using type = double; -}; - -// Always use `double` for MERGE_STDS +// Always use `double` for MERGE_M2 template -struct target_type_impl { +struct target_type_impl { using type = double; }; @@ -1182,10 +1147,8 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MERGE_SETS: return f.template operator()(std::forward(args)...); - case aggregation::MERGE_VARIANCES: - return f.template operator()(std::forward(args)...); - case aggregation::MERGE_STDS: - return f.template operator()(std::forward(args)...); + case aggregation::MERGE_M2: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 26340bf6c41..53a55351f8e 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -185,13 +185,7 @@ std::vector> simple_aggregations_collector::visit( } std::vector> simple_aggregations_collector::visit( - data_type col_type, merge_variances_aggregation const& agg) -{ - return visit(col_type, static_cast(agg)); -} - -std::vector> simple_aggregations_collector::visit( - data_type col_type, merge_stds_aggregation const& agg) + data_type col_type, merge_m2_aggregation const& agg) { return visit(col_type, static_cast(agg)); } @@ -325,12 +319,7 @@ void aggregation_finalizer::visit(merge_sets_aggregation const& agg) visit(static_cast(agg)); } -void aggregation_finalizer::visit(merge_variances_aggregation const& agg) -{ - visit(static_cast(agg)); -} - -void aggregation_finalizer::visit(merge_stds_aggregation const& agg) +void aggregation_finalizer::visit(merge_m2_aggregation const& agg) { visit(static_cast(agg)); } @@ -589,21 +578,13 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, template std::unique_ptr make_merge_sets_aggregation(null_equality, nan_equality); -/// Factory to create a MERGE_VARIANCES aggregation -template -std::unique_ptr make_merge_variances_aggregation(size_type ddof) -{ - return std::make_unique(ddof); -} -template std::unique_ptr make_merge_variances_aggregation(size_type); - -/// Factory to create a MERGE__STD aggregation +/// Factory to create a MERGE_M2 aggregation template -std::unique_ptr make_merge_stds_aggregation(size_type ddof) +std::unique_ptr make_merge_m2_aggregation() { - return 
std::make_unique(ddof); + return std::make_unique(); } -template std::unique_ptr make_merge_stds_aggregation(size_type); +template std::unique_ptr make_merge_m2_aggregation(); namespace detail { namespace { diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 473062dc266..8c12aa57f6d 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -496,80 +496,26 @@ void aggregate_result_functor::operator()(aggregation c }; /** - * @brief Perform merging for the lists that correspond to the same key value. - * - * This aggregation is similar to `COLLECT_LIST` with the following differences: - * - It requires the input values to be a non-nullable lists column, and - * - The values (lists) corresponding to the same key will not result in a list of lists as output - * from `COLLECT_LIST`. Instead, those lists will result in a list generated by merging them - * together. + * @brief Perform merging for the M2 values that correspond to the same key value. * - * In practice, this aggregation is used to merge the partial results of multiple (distributed) - * groupby `COLLECT_LIST` aggregations into a final `COLLECT_LIST` result. Those distributed - * aggregations were executed on different values columns partitioned from the original values - * column, then their results were (vertically) concatenated before given as the values column for - * this aggregation. - * - * TODO + * The partial results input to this aggregation is a structs column with children are + * columns generated by three groupby aggregations: `SUM`, `COUNT_VALID`, and `M2` performed on + * partitioned datasets. After distributedly computed, the results output from these aggregations + * are (vertically) concatenated before assembling into a structs column given as the values column + * for this aggregation. */ template <> -void aggregate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) { return; } cache.add_result( col_idx, agg, - detail::group_merge_lists( + detail::group_merge_m2( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; -/** - * @brief Perform merging for the lists corresponding to the same key value, then dropping duplicate - * list entries. - * - * This aggregation is similar to `COLLECT_SET` with the following differences: - * - It requires the input values to be a non-nullable lists column, and - * - The values (lists) corresponding to the same key will result in a list generated by merging - * them together then dropping duplicate entries. - * - * In practice, this aggregation is used to merge the partial results of multiple (distributed) - * groupby `COLLECT_LIST` or `COLLECT_SET` aggregations into a final `COLLECT_SET` result. Those - * distributed aggregations were executed on different values columns partitioned from the original - * values column, then their results were (vertically) concatenated before given as the values - * column for this aggregation. - * - * Firstly, this aggregation performs `MERGE_LISTS` to concatenate the input lists (corresponding to - * the same key) into intermediate lists, then it calls `lists::drop_list_duplicates` on them to - * remove duplicate list entries. 
As such, the input (partial results) to this aggregation should be - * generated by (distributed) `COLLECT_LIST` aggregations, not `COLLECT_SET`, to avoid unnecessarily - * removing duplicate entries for the partial results. - * - * Since duplicate list entries will be removed, the parameters `null_equality` and `nan_equality` - * are needed for calling to `lists::drop_list_duplicates`. - * - * TODO - */ -template <> -void aggregate_result_functor::operator()(aggregation const& agg) -{ - if (cache.has_result(col_idx, agg)) { return; } - - auto const merged_result = detail::group_merge_lists(get_grouped_values(), - helper.group_offsets(stream), - helper.num_groups(stream), - stream, - rmm::mr::get_current_device_resource()); - auto const merge_sets_agg = dynamic_cast(agg); - cache.add_result(col_idx, - agg, - lists::detail::drop_list_duplicates(lists_column_view(merged_result->view()), - merge_sets_agg._nulls_equal, - merge_sets_agg._nans_equal, - stream, - mr)); -}; - } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_merge_variances.cu b/cpp/src/groupby/sort/group_merge_m2.cu similarity index 86% rename from cpp/src/groupby/sort/group_merge_variances.cu rename to cpp/src/groupby/sort/group_merge_m2.cu index 329f22431b1..4ffc29c2a56 100644 --- a/cpp/src/groupby/sort/group_merge_variances.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -26,11 +26,11 @@ namespace cudf { namespace groupby { namespace detail { -std::unique_ptr group_merge_variances(column_view const& values, - cudf::device_span group_offsets, - size_type num_groups, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr group_merge_m2(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(values.type().id() == type_id::LIST, "Input to `group_merge_lists` must be a lists column."); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 78492f0dd9b..032973481a0 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -435,7 +435,7 @@ std::unique_ptr group_merge_lists(column_view const& values, // todo /** - * @brief Internal API to merge grouped variances into one variance value. + * @brief Internal API to merge grouped m2 values corresponding to the same key. * * @code{.pseudo} * values = [[2, 1], [], [4, -1, -2], [], [, 4, ]] @@ -451,11 +451,11 @@ std::unique_ptr group_merge_lists(column_view const& values, * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -std::unique_ptr group_merge_variances(column_view const& values, - cudf::device_span group_offsets, - size_type num_groups, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr group_merge_m2(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** @endinternal * From 93b38c41bf965bf93bc4551d5fab655f1011d18a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 30 Jun 2021 13:05:47 -0600 Subject: [PATCH 04/19] Finish implementation, no unit tests implemented yet --- cpp/CMakeLists.txt | 7 +- cpp/include/cudf/aggregation.hpp | 11 +- cpp/src/groupby/sort/aggregate.cpp | 44 ++++--- cpp/src/groupby/sort/group_m2.cu | 140 ++++++++++++++++++++++ cpp/src/groupby/sort/group_merge_m2.cu | 128 ++++++++++++++------ cpp/src/groupby/sort/group_reductions.hpp | 53 +++----- cpp/src/groupby/sort/group_std.cu | 88 ++------------ 7 files changed, 284 insertions(+), 187 deletions(-) create mode 100644 cpp/src/groupby/sort/group_m2.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c6d56f7075b..adc62494382 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -194,15 +194,16 @@ add_library(cudf src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/groupby.cu + src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu - src/groupby/sort/aggregate.cpp src/groupby/sort/group_collect.cu - src/groupby/sort/group_merge_lists.cu - src/groupby/sort/group_merge_m2.cu src/groupby/sort/group_count.cu + src/groupby/sort/group_m2.cu src/groupby/sort/group_max.cu src/groupby/sort/group_min.cu + src/groupby/sort/group_merge_lists.cu + src/groupby/sort/group_merge_m2.cu src/groupby/sort/group_nth_element.cu src/groupby/sort/group_nunique.cu src/groupby/sort/group_product.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 8602b7086f1..30da2976bda 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -164,9 +164,9 @@ std::unique_ptr make_mean_aggregation(); /** * @brief Factory to create a M2 aggregation * - * A M2 aggregation is groupwise sum of squares of differences from the group mean. From this, - * a `VARIANCE` aggregation can be computed as `M2 / (N - ddof)`, where `N` is the population size - * and `ddof` is the delta degrees of freedom. + * A M2 aggregation is groupwise sum of squares of differences from the group mean. It produces the + * intermediate values that are used to compute variance and standard deviation in distributed + * computing. */ template std::unique_ptr make_m2_aggregation(); @@ -347,10 +347,7 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu * * This aggregation is designed specificly to perform distributed computing of `M2` * aggregation. The partial results input to this aggregation is a structs column with children are - * columns generated by three groupby aggregations: `SUM`, `COUNT_VALID`, and `M2`. - * - * Since the partial results output from those aggregations do not contain nulls, the input structs - * column and its children must all be non-nullable. + * columns generated by three groupby aggregations: `M2`, `COUNT_VALID`, and `MEAN`. 
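
// A minimal usage sketch, mirroring the call pattern of the unit tests added later in
// this series; the wrapper function `m2_per_key` and its signature are assumptions for
// illustration, not part of the patch.
#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <vector>

std::unique_ptr<cudf::column> m2_per_key(cudf::column_view const& keys,
                                         cudf::column_view const& vals)
{
  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back(cudf::groupby::aggregation_request());
  requests[0].values = vals;
  requests[0].aggregations.emplace_back(cudf::make_m2_aggregation());

  auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys}));
  auto result = gb_obj.aggregate(requests);  // result.first holds the unique keys
  return std::move(result.second[0].results[0]);
}
// For the distributed path, each partition would run COUNT_VALID, MEAN, and M2 this
// way, the partial outputs would be concatenated and assembled into a structs column,
// and a second groupby would request make_merge_m2_aggregation() on that column.
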
*/ template std::unique_ptr make_merge_m2_aggregation(); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 8c12aa57f6d..78db3ad1f68 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -243,7 +243,7 @@ void aggregate_result_functor::operator()(aggregation const& ag { if (cache.has_result(col_idx, agg)) return; - auto mean_agg = make_mean_aggregation(); + auto const mean_agg = make_mean_aggregation(); operator()(*mean_agg); auto const mean_result = cache.get_result(col_idx, *mean_agg); @@ -258,27 +258,21 @@ void aggregate_result_functor::operator()(aggregation con { if (cache.has_result(col_idx, agg)) return; - auto const count_agg = make_count_aggregation(); + auto var_agg = dynamic_cast(agg); + auto mean_agg = make_mean_aggregation(); + auto count_agg = make_count_aggregation(); + operator()(*mean_agg); operator()(*count_agg); + column_view mean_result = cache.get_result(col_idx, *mean_agg); + column_view group_sizes = cache.get_result(col_idx, *count_agg); - auto const group_sizes = cache.get_result(col_idx, *count_agg); - auto const& group_labels = helper.group_labels(stream); - auto const ddof = dynamic_cast(agg)._ddof; - - auto result = [&] { - if (auto m2_agg = make_m2_aggregation(); cache.has_result(col_idx, *m2_agg)) { - auto const group_m2 = cache.get_result(col_idx, *m2_agg); - return detail::group_var_from_m2(group_m2, group_sizes, group_labels, ddof, stream, mr); - } else { - auto mean_agg = make_mean_aggregation(); - operator()(*mean_agg); - auto const mean_result = cache.get_result(col_idx, *mean_agg); - - return detail::group_var( - get_grouped_values(), mean_result, group_sizes, group_labels, ddof, stream, mr); - } - }(); - + auto result = detail::group_var(get_grouped_values(), + mean_result, + group_sizes, + helper.group_labels(stream), + var_agg._ddof, + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -498,11 +492,15 @@ void aggregate_result_functor::operator()(aggregation c /** * @brief Perform merging for the M2 values that correspond to the same key value. * - * The partial results input to this aggregation is a structs column with children are - * columns generated by three groupby aggregations: `SUM`, `COUNT_VALID`, and `M2` performed on - * partitioned datasets. After distributedly computed, the results output from these aggregations + * The partial results input to this aggregation is a structs column with children are columns + * generated by three other groupby aggregations: `M2`, `MEAN`, `COUNT_VALID` performed + * on partitioned datasets. After distributedly computed, the results output from these aggregations * are (vertically) concatenated before assembling into a structs column given as the values column * for this aggregation. + * + * The values of M2 are merged following the parallel algorithm described here: + * https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Parallel_algorithm + * */ template <> void aggregate_result_functor::operator()(aggregation const& agg) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu new file mode 100644 index 00000000000..2de57a94923 --- /dev/null +++ b/cpp/src/groupby/sort/group_m2.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "group_reductions.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace { + +template +struct m2_transform { + column_device_view const d_values; + Iterator const values_iter; + ResultType const* d_means; + size_type const* d_group_labels; + + __device__ ResultType operator()(size_type const idx) const noexcept + { + if (d_values.is_null(idx)) return 0.0; + + auto const x = static_cast(values_iter[idx]); + auto const group_idx = d_group_labels[idx]; + auto const mean = d_means[group_idx]; + auto const diff = x - mean; + return diff * diff; + } +}; + +template +void reduce_by_key_fn(column_device_view const& values, + Iterator values_iter, + cudf::device_span group_labels, + ResultType const* d_means, + ResultType* d_result, + rmm::cuda_stream_view stream) +{ + auto const var_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, + m2_transform{ + values, values_iter, d_means, group_labels.data()}); + + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + var_iter, + thrust::make_discard_iterator(), + d_result); +} + +struct m2_functor { + template + std::enable_if_t::value, std::unique_ptr> operator()( + column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using ResultType = cudf::detail::target_type_t; + auto result = make_numeric_column( + data_type(type_to_id()), group_means.size(), mask_state::UNALLOCATED, stream, mr); + + auto const values_dv_ptr = column_device_view::create(values, stream); + auto const d_values = *values_dv_ptr; + auto const d_means = group_means.data(); + auto const d_result = result->mutable_view().data(); + + if (!cudf::is_dictionary(values.type())) { + auto const values_iter = d_values.begin(); + reduce_by_key_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + } else { + auto const values_iter = + cudf::dictionary::detail::make_dictionary_iterator(*values_dv_ptr); + reduce_by_key_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + } + + // M2 column values should have the same bitmask as means's. + if (group_means.nullable()) { + result->set_null_mask(cudf::detail::copy_bitmask(group_means, stream, mr), + group_means.null_count()); + } + + return result; + } + + template + std::enable_if_t::value, std::unique_ptr> operator()(Args&&...) + { + CUDF_FAIL("Only numeric types are supported in M2 groupby aggregation"); + } +}; + +} // namespace + +std::unique_ptr group_m2(column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto values_type = cudf::is_dictionary(values.type()) + ? 
dictionary_column_view(values).keys().type() + : values.type(); + + return type_dispatcher(values_type, m2_functor{}, values, group_means, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 4ffc29c2a56..8176c2ea712 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -14,59 +14,113 @@ * limitations under the License. */ +#include #include -#include +#include +#include +#include +#include #include +#include #include #include -#include +#include +#include namespace cudf { namespace groupby { namespace detail { + +namespace { +template +struct accumulate_fn { + ResultType M2_a; + ResultType mean_a; + size_type n_a; + + void __device__ operator()(ResultType const M2_b, + ResultType const mean_b, + size_type const n_b) noexcept + { + if (n_b == 0) { return; } + + auto const n_ab = n_a + n_b; + auto const delta = mean_b - mean_a; + M2_a += + M2_b + (delta * delta) * static_cast(n_a) * static_cast(n_b) / n_ab; + mean_a = (mean_a * n_a + mean_b * n_b) / n_ab; + n_a = n_ab; + } +}; + +} // namespace + std::unique_ptr group_merge_m2(column_view const& values, cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(values.type().id() == type_id::LIST, - "Input to `group_merge_lists` must be a lists column."); - CUDF_EXPECTS(!values.nullable(), - "Input to `group_merge_lists` must be a non-nullable lists column."); - - auto offsets_column = make_numeric_column( - data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); - - // Generate offsets of the output lists column by gathering from the provided group offsets and - // the input list offsets. - // - // For example: - // values = [[2, 1], [], [4, -1, -2], [], [, 4, ]] - // list_offsets = [0, 2, 2, 5, 5 8] - // group_offsets = [0, 3, 5] - // - // then, the output offsets_column is [0, 5, 8]. - // - thrust::gather(rmm::exec_policy(stream), - group_offsets.begin(), - group_offsets.end(), - lists_column_view(values).offsets_begin(), - offsets_column->mutable_view().template begin()); - - // The child column of the output lists column is just copied from the input column. 
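
// A worked illustration of the merge step implemented by `accumulate_fn` above (the
// pairwise update of the parallel variance algorithm). The numbers are an assumed
// example, not taken from the patch: merging partial result A = (n = 2, mean = 1.5,
// M2 = 0.5), computed from {1, 2}, with B = (n = 1, mean = 4.0, M2 = 0.0), computed
// from {4}:
//   n_ab  = n_a + n_b                                 = 3
//   delta = mean_b - mean_a                           = 2.5
//   M2    = M2_a + M2_b + delta^2 * n_a * n_b / n_ab  = 0.5 + 0.0 + 6.25 * 2 / 3 ~= 4.666667
//   mean  = (mean_a * n_a + mean_b * n_b) / n_ab      = 7 / 3                    ~= 2.333333
// which equals the M2 and MEAN computed directly on the combined group {1, 2, 4}.
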
- auto child_column = - std::make_unique(lists_column_view(values).get_sliced_child(stream), stream, mr); - - return make_lists_column(num_groups, - std::move(offsets_column), - std::move(child_column), - 0, - rmm::device_buffer{}, - stream, - mr); + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `group_merge_m2` must be a structs column."); + CUDF_EXPECTS(values.num_children() == 3, + "Input to `group_merge_m2` must be a structs column having 3 children columns."); + + using ResultType = id_to_type; + static_assert( + std::is_same_v, ResultType>); + CUDF_EXPECTS(values.child(0).type().id() == type_to_id() && + values.child(1).type().id() == type_to_id() && + values.child(2).type().id() == type_id::INT32, + "Input to `group_merge_m2` must be a structs column having children columns " + "containing tuples of groupwise (M2_value, mean, valid_count)."); + + auto result = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + + auto const M2_values = values.child(0); + auto const mean_values = values.child(1); + auto const count_valid = values.child(2); + auto const iter = thrust::make_counting_iterator(0); + auto validities = rmm::device_uvector(num_groups, stream); + + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_groups, + result->mutable_view().data(), + [d_M2 = M2_values.template begin(), + d_mean = mean_values.template begin(), + d_count = count_valid.template begin(), + d_offsets = group_offsets.begin(), + d_valid = validities.begin()] __device__(auto const group_idx) { + auto const start_idx = d_offsets[group_idx], + end_idx = d_offsets[group_idx + 1]; + + // Firstly, this stores (M2, mean, valid_count) of the first partial result. + // Then, merge all the following partial results into it. + auto accumulator = accumulate_fn{ + d_M2[start_idx], d_mean[start_idx], d_count[start_idx]}; + + for (auto idx = start_idx + 1; idx < end_idx; ++idx) { + auto const n_b = d_count[idx]; + auto const M2_b = n_b > 0 ? d_M2[idx] : ResultType{0}; + auto const mean_b = n_b > 0 ? d_mean[idx] : ResultType{0}; + accumulator(M2_b, mean_b, n_b); + } + + // If there are all nulls in the partial results (i.e., sum of valid counts is + // zero), then output a null. + d_valid[group_idx] = accumulator.n_a > 0; + return accumulator.n_a > 0 ? accumulator.M2_a : ResultType{0}; + }); + + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { result->set_null_mask(null_mask, null_count); } + + return result; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 032973481a0..d3e043a0ac6 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -218,18 +218,19 @@ std::unique_ptr group_count_all(cudf::device_span group rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to calculate groupwise sum of squares of differences from the current mean + * @brief Internal API to calculate groupwise sum of squares of differences from group means. * - * @code{.pseudo} - * values = [2, 1, 4, -1, -2, , 4, ] - * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] - * group_means = [2.333333, -1.5, 4.0, ] + * If there are only nulls in the group, the output value of that group will be `0`. 
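
// Note: in the group_m2.cu implementation earlier in this patch, null rows contribute
// 0.0 to the reduction and the result column reuses the null mask of the pre-computed
// group means, so for an all-null group the stored value is 0 but the row itself is
// masked as null (the AllNullValuesInput unit test added later in this series checks
// exactly this behavior).
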
* - * group_m2 = [4.666666, 1.0, 0, ] + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_means = [2.333333, -1.5, 4.0, ] + * group_m2(...) = [4.666666, 1.0, 0.0, ] * @endcode * - * @param values Grouped values to get M2 of - * @param group_means Pre-calculated groupwise MEAN + * @param values Grouped values to compute M2 values + * @param group_means Pre-computed groupwise MEAN * @param group_labels ID of group corresponding value in @p values belongs to * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. @@ -240,24 +241,6 @@ std::unique_ptr group_m2(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -/** - * @brief group_var - * @param values - * @param group_means - * @param group_sizes - * @param group_labels - * @param ddof - * @param stream - * @param mr - * @return - */ -std::unique_ptr group_var_from_m2(column_view const& group_m2, - column_view const& group_sizes, - cudf::device_span group_labels, - size_type ddof, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Internal API to calculate groupwise variance * @@ -433,23 +416,21 @@ std::unique_ptr group_merge_lists(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -// todo /** - * @brief Internal API to merge grouped m2 values corresponding to the same key. + * @brief Internal API to merge grouped M2 values corresponding to the same key. * - * @code{.pseudo} - * values = [[2, 1], [], [4, -1, -2], [], [, 4, ]] - * group_offsets = [0, 3, 5] - * num_groups = 2 + * The values of M2 are merged following the parallel algorithm described here: + * https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Parallel_algorithm * - * group_merge_lists(...) = [[2, 1, 4, -1, -2], [, 4, ]] - * @endcode + * Merging M2 values require accessing to partial M2 values and also groupwise means and group valid + * counts. Thus, the input to this aggregation need to be a structs column containing tuples of + * groupwise `(M2_value, mean, valid_count)`. * - * @param values Grouped values (lists column) to collect. + * @param values Grouped values (tuples of groupwise `(M2_value, mean, valid_count)`) to merge M2. * @param group_offsets Offsets of groups' starting points within @p values. * @param num_groups Number of groups. + * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ std::unique_ptr group_merge_m2(column_view const& values, cudf::device_span group_offsets, diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index c63db6c0c4e..9ebb516ee14 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -49,22 +49,16 @@ struct var_transform { { if (d_values.is_null(i)) return 0.0; - auto const x = static_cast(values_iter[i]); - auto const group_idx = d_group_labels[i]; + ResultType x = static_cast(values_iter[i]); - if (d_group_sizes) { // for variance/std - auto const group_size = d_group_sizes[group_idx]; + size_type group_idx = d_group_labels[i]; + size_type group_size = d_group_sizes[group_idx]; - // prevent divide by zero error - if (group_size == 0 or group_size - ddof <= 0) return 0.0; + // prevent divide by zero error + if (group_size == 0 or group_size - ddof <= 0) return 0.0; - auto const mean = d_means[group_idx]; - return (x - mean) * (x - mean) / (group_size - ddof); - } else { // for m2 - auto const mean = d_means[group_idx]; - auto const diff = x - mean; - return diff * diff; - } + ResultType mean = d_means[group_idx]; + return (x - mean) * (x - mean) / (group_size - ddof); } }; @@ -91,50 +85,6 @@ void reduce_by_key_fn(column_device_view const& values, d_result); } -struct m2_functor { - template - std::enable_if_t::value, std::unique_ptr> operator()( - column_view const& values, - column_view const& group_means, - cudf::device_span group_labels, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { -// Running this in debug build causes a runtime error: -// `reduce_by_key failed on 2nd step: invalid device function` -#if !defined(__CUDACC_DEBUG__) - using ResultType = cudf::detail::target_type_t; - - auto result = make_numeric_column( - data_type(type_to_id()), values.size(), mask_state::UNINITIALIZED, stream, mr); - - auto const values_dv_ptr = column_device_view::create(values, stream); - auto const values_dv = *values_dv_ptr; - - auto d_means = group_means.data(); - auto d_result = result->mutable_view().data(); - - if (!cudf::is_dictionary(values.type())) { - auto const values_iter = values_dv.begin(); - reduce_by_key_fn(values_dv, values_iter, group_labels, d_means, nullptr, 0, d_result, stream); - } else { - auto values_iter = cudf::dictionary::detail::make_dictionary_iterator(*values_dv_ptr); - reduce_by_key_fn(values_dv, values_iter, group_labels, d_means, nullptr, 0, d_result, stream); - } - - return result; -#else - CUDF_FAIL("Groupby m2 aggregation is not supported in debug build"); -#endif - } - - template - std::enable_if_t::value, std::unique_ptr> operator()(Args&&...) - { - CUDF_FAIL("Only numeric types are supported in groupby m2 aggregation"); - } -}; - struct var_functor { template std::enable_if_t::value, std::unique_ptr> operator()( @@ -202,30 +152,6 @@ struct var_functor { } // namespace -std::unique_ptr group_m2(column_view const& values, - column_view const& group_means, - cudf::device_span group_labels, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto values_type = cudf::is_dictionary(values.type()) - ? 
dictionary_column_view(values).keys().type() - : values.type(); - - return type_dispatcher(values_type, m2_functor{}, values, group_means, group_labels, stream, mr); -} - -std::unique_ptr group_var_from_m2(column_view const& group_m2, - column_view const& group_sizes, - cudf::device_span group_labels, - size_type ddof, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // TODO - return std::make_unique(); -} - std::unique_ptr group_var(column_view const& values, column_view const& group_means, column_view const& group_sizes, From 8c74cb44eb3e0cd12ce76942defadebcf21ad982 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 1 Jul 2021 17:11:02 -0600 Subject: [PATCH 05/19] Finish unit tests for M2 aggregation --- .../cudf/detail/aggregation/aggregation.hpp | 1 + cpp/src/groupby/sort/aggregate.cpp | 1 - cpp/src/groupby/sort/group_m2.cu | 2 - cpp/tests/CMakeLists.txt | 2 + cpp/tests/groupby/m2_tests.cpp | 223 ++++++++++++++++++ 5 files changed, 226 insertions(+), 3 deletions(-) create mode 100644 cpp/tests/groupby/m2_tests.cpp diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index d4176e1ab6c..c51eeb6bc7a 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1117,6 +1117,7 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MEAN: return f.template operator()(std::forward(args)...); + case aggregation::M2: return f.template operator()(std::forward(args)...); case aggregation::VARIANCE: return f.template operator()(std::forward(args)...); case aggregation::STD: diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 78db3ad1f68..59c40c24198 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -500,7 +500,6 @@ void aggregate_result_functor::operator()(aggregation c * * The values of M2 are merged following the parallel algorithm described here: * https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Parallel_algorithm - * */ template <> void aggregate_result_functor::operator()(aggregation const& agg) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index 2de57a94923..533a8b93d2e 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -#include "group_reductions.hpp" - #include #include #include diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4360b418e95..bb7c3233bc3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,11 +62,13 @@ ConfigureTest(GROUPBY_TEST groupby/count_tests.cpp groupby/groups_tests.cpp groupby/keys_tests.cpp + groupby/m2_tests.cpp groupby/min_tests.cpp groupby/max_scan_tests.cpp groupby/max_tests.cpp groupby/mean_tests.cpp groupby/median_tests.cpp + groupby/merge_m2_tests.cpp groupby/merge_lists_tests.cpp groupby/merge_sets_tests.cpp groupby/min_scan_tests.cpp diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp new file mode 100644 index 00000000000..656a5af09df --- /dev/null +++ b/cpp/tests/groupby/m2_tests.cpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +using namespace cudf::test::iterators; + +namespace { +constexpr bool print_all{false}; // For debugging +constexpr int32_t null{0}; // Mark for null elements +constexpr double NaN{std::numeric_limits::quiet_NaN()}; // Mark for NaN double elements + +template +using keys_col = cudf::test::fixed_width_column_wrapper; + +template +using vals_col = cudf::test::fixed_width_column_wrapper; + +template +using M2s_col = cudf::test::fixed_width_column_wrapper; + +auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values) +{ + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[0].values = values; + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); + auto result = gb_obj.aggregate(requests); + return std::make_pair(std::move(result.first->release()[0]), + std::move(result.second[0].results[0])); +} +} // namespace + +template +struct GroupbyM2TypedTest : public cudf::test::BaseFixture { +}; + +using TestTypes = + cudf::test::Concat; +TYPED_TEST_SUITE(GroupbyM2TypedTest, TestTypes); + +TYPED_TEST(GroupbyM2TypedTest, EmptyInput) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + auto const keys = keys_col{}; + auto const vals = vals_col{}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_M2s = M2s_col{}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + +TYPED_TEST(GroupbyM2TypedTest, AllNullKeysInput) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + auto const keys = keys_col{{1, 2, 3}, all_nulls()}; + auto const vals = vals_col{3, 4, 5}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_keys = keys_col{}; + auto const expected_M2s = M2s_col{}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + +TYPED_TEST(GroupbyM2TypedTest, AllNullValuesInput) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + auto const keys = keys_col{1, 2, 3}; + auto const vals = vals_col{{3, 4, 5}, all_nulls()}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_M2s = M2s_col{{null, null, null}, all_nulls()}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + +TYPED_TEST(GroupbyM2TypedTest, SimpleInput) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // key = 1: vals = [0, 3, 6] + // key = 2: vals = [1, 4, 5, 9] + // key = 3: vals = [2, 7, 8] + auto const keys = keys_col{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = vals_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_keys = keys_col{1, 2, 3}; + auto const expected_M2s = 
M2s_col{18.0, 32.75, 20.0 + 2.0 / 3.0}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + +TYPED_TEST(GroupbyM2TypedTest, ValuesHaveNulls) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + auto const keys = keys_col{1, 2, 3, 4, 5, 2, 3, 2}; + auto const vals = vals_col{{0, null, 2, 3, null, 5, 6, 7}, nulls_at({1, 4})}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_keys = keys_col{1, 2, 3, 4, 5}; + auto const expected_M2s = M2s_col{{0.0, 2.0, 8.0, 0.0, 0.0 /*NULL*/}, null_at(4)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + +TYPED_TEST(GroupbyM2TypedTest, KeysAndValuesHaveNulls) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // key = 1: vals = [null, 3, 6] + // key = 2: vals = [1, 4, null, 9] + // key = 3: vals = [2, 8] + // key = 4: vals = [null] + auto const keys = keys_col{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)}; + auto const vals = vals_col{{null, 1, 2, 3, 4, null, 6, 7, 8, 9, null}, nulls_at({0, 5, 10})}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{{4.5, 32.0 + 2.0 / 3.0, 18.0, 0.0 /*NULL*/}, null_at(3)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + +TYPED_TEST(GroupbyM2TypedTest, InputHaveNullsAndNaNs) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // key = 1: vals = [0, 3, 6] + // key = 2: vals = [1, 4, NaN, 9] + // key = 3: vals = [null, 2, 8] + // key = 4: vals = [null, 10, NaN] + auto const keys = keys_col{{4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4}, null_at(9)}; + auto const vals = vals_col{ + {0.0 /*NULL*/, 0.0 /*NULL*/, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN}, + nulls_at({0, 1})}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + +TYPED_TEST(GroupbyM2TypedTest, SlicedColumnsInput) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // This test should compute M2 aggregation on the same dataset as the InputHaveNullsAndNaNs test. 
+ // i.e.: + // + // key = 1: vals = [0, 3, 6] + // key = 2: vals = [1, 4, NaN, 9] + // key = 3: vals = [null, 2, 8] + // key = 4: vals = [null, 10, NaN] + + auto const keys_original = keys_col{{ + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, // discarded, don't care + 4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4, // used + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 // discarded, don't care + }, + null_at(19)}; + auto const vals_original = vals_col{ + { + 3.0, 2.0, 5.0, 4.0, 6.0, 9.0, 1.0, 0.0, 1.0, 7.0, // discarded, don't care + 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN, // used + 9.0, 10.0, 11.0, 12.0, 0.0, 5.0, 1.0, 20.0, 19.0, 15.0 // discarded, don't care + }, + nulls_at({10, 11})}; + + auto const keys = cudf::slice(keys_original, {10, 24})[0]; + auto const vals = cudf::slice(vals_original, {10, 24})[0]; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} From 44a3b16e419e60a1db4d97d4f7b6eb8ef0e69dc7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Jul 2021 13:32:55 -0600 Subject: [PATCH 06/19] Rewrite doxygen --- cpp/include/cudf/aggregation.hpp | 10 ++++++---- cpp/src/groupby/sort/aggregate.cpp | 13 +++++++++---- cpp/src/groupby/sort/group_reductions.hpp | 8 ++++++-- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 30da2976bda..e309c80a50a 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -309,7 +309,7 @@ std::unique_ptr make_udf_aggregation(udf_type type, * @brief Factory to create a MERGE_LISTS aggregation. * * Given a lists column, this aggregation merges all the lists corresponding to the same key value - * into one list. It is designed specificly to merge the partial results of multiple (distributed) + * into one list. It is designed specifically to merge the partial results of multiple (distributed) * groupby `COLLECT_LIST` aggregations into a final `COLLECT_LIST` result. As such, it requires the * input lists column to be non-nullable (the child column containing list entries is not subjected * to this requirement). @@ -324,7 +324,7 @@ std::unique_ptr make_merge_lists_aggregation(); * value into one list, then it drops all the duplicate entries in each lists, producing a lists * column containing non-repeated entries. * - * This aggregation is designed specificly to merge the partial results of multiple (distributed) + * This aggregation is designed specifically to merge the partial results of multiple (distributed) * groupby `COLLECT_LIST` or `COLLECT_SET` aggregations into a final `COLLECT_SET` result. As such, * it requires the input lists column to be non-nullable (the child column containing list entries * is not subjected to this requirement). @@ -345,9 +345,11 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu /** * @brief Factory to create a MERGE_M2 aggregation * - * This aggregation is designed specificly to perform distributed computing of `M2` + * This aggregation is designed specifically to accommodate distributed computing of `M2` * aggregation. The partial results input to this aggregation is a structs column with children are - * columns generated by three groupby aggregations: `M2`, `COUNT_VALID`, and `MEAN`. 
+ * columns generated by three groupby aggregations: `COUNT_VALID`, `MEAN`, and `M2`. The output of + * this aggregation is also a structs column containing the merged values of all those aggregations, + * which are all required for recursively merging of `M2` values. */ template std::unique_ptr make_merge_m2_aggregation(); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 59c40c24198..4e60d8d3f7d 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -493,10 +493,15 @@ void aggregate_result_functor::operator()(aggregation c * @brief Perform merging for the M2 values that correspond to the same key value. * * The partial results input to this aggregation is a structs column with children are columns - * generated by three other groupby aggregations: `M2`, `MEAN`, `COUNT_VALID` performed - * on partitioned datasets. After distributedly computed, the results output from these aggregations - * are (vertically) concatenated before assembling into a structs column given as the values column - * for this aggregation. + * generated by three other groupby aggregations: `COUNT_VALID`, `MEAN`, and `M2` that were + * performed on partitioned datasets. After distributedly computed, the results output from these + * aggregations are (vertically) concatenated before assembling into a structs column given as the + * values column for this aggregation. + * + * For recursive merging of `M2` values, the aggregations values of all input (`COUNT_VALID`, + * `MEAN`, and `M2`) are all merged and stored in the output of this aggregation. As such, the + * output will be a structs column containing children columns of merged `COUNT_VALID`, `MEAN`, and + * `M2` values. * * The values of M2 are merged following the parallel algorithm described here: * https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Parallel_algorithm diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index d3e043a0ac6..2b88317fcc3 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -424,9 +424,13 @@ std::unique_ptr group_merge_lists(column_view const& values, * * Merging M2 values require accessing to partial M2 values and also groupwise means and group valid * counts. Thus, the input to this aggregation need to be a structs column containing tuples of - * groupwise `(M2_value, mean, valid_count)`. + * groupwise `(valid_count, mean, M2_value)`. * - * @param values Grouped values (tuples of groupwise `(M2_value, mean, valid_count)`) to merge M2. + * This aggregation not only merges the partial results of `M2` but also merged all the partial + * results of input aggregations (`COUNT_VALID`, `MEAN`, and `M2`). As such, the output will be a + * structs column containing children columns of merged `COUNT_VALID`, `MEAN`, and `M2` values. + * + * @param values Grouped values (tuples of groupwise `(valid_count, mean, M2_value)`) to merge. * @param group_offsets Offsets of groups' starting points within @p values. * @param num_groups Number of groups. 
* @param mr Device memory resource used to allocate the returned column's device memory From 1d54ef56e07e6e35d851793c5e75282bd14dd82b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Jul 2021 13:33:22 -0600 Subject: [PATCH 07/19] Fix `MERGE_M2`implementation --- .../cudf/detail/aggregation/aggregation.hpp | 4 +- cpp/src/groupby/sort/group_merge_m2.cu | 143 +++++++++++++----- 2 files changed, 103 insertions(+), 44 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index c51eeb6bc7a..10d9d8c1b92 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1044,10 +1044,10 @@ struct target_type_impl { using type = cudf::list_view; }; -// Always use `double` for MERGE_M2 +// Always use struct for MERGE_M2 template struct target_type_impl { - using type = double; + using type = cudf::struct_view; }; /** diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 8176c2ea712..3752b5b1d92 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -34,15 +34,20 @@ namespace groupby { namespace detail { namespace { +/** + * @brief Functor to accumulate (merge) all partial results corresponding to the same key into a + * final result storing in its member variables. It performs merging for the partial results of + * `COUNT_VALID`, `MEAN`, and `M2` at the same time. + */ template struct accumulate_fn { - ResultType M2_a; - ResultType mean_a; size_type n_a; + ResultType mean_a; + ResultType M2_a; - void __device__ operator()(ResultType const M2_b, + void __device__ operator()(size_type const n_b, ResultType const mean_b, - size_type const n_b) noexcept + ResultType const M2_b) noexcept { if (n_b == 0) { return; } @@ -55,6 +60,57 @@ struct accumulate_fn { } }; +/** + * @brief Functor to merge partial results of `COUNT_VALID`, `MEAN`, and `M2` aggregations + * for a given group (key) index. + */ +template +struct merge_fn { + size_type const* const d_offsets; + size_type const* const d_counts; + ResultType const* const d_means; + ResultType const* const d_M2s; + + auto __device__ operator()(size_type const group_idx) noexcept + { + auto const start_idx = d_offsets[group_idx], end_idx = d_offsets[group_idx + 1]; + + // This case should never happen, because all groups are non-empty due to the given input. + // Here just to make sure we cover this case. + if (start_idx == end_idx) { + return thrust::make_tuple(size_type{0}, ResultType{0}, ResultType{0}, int8_t{0}); + } + + // Firstly, this stores (valid_count, mean, M2) of the first partial result. + // Then, it accumulates (merges) the remaining partial results into it. + // Note that, if `n_a == 0` then `mean_a` and `M2_a` will be null. + // Thus, in such situations, we need to set zero for them before accumulating partial results. + auto const n_a = d_counts[start_idx]; + auto const mean_a = n_a > 0 ? d_means[start_idx] : ResultType{0}; + auto const M2_a = n_a > 0 ? d_M2s[start_idx] : ResultType{0}; + auto accumulator = accumulate_fn{n_a, mean_a, M2_a}; + + for (auto idx = start_idx + 1; idx < end_idx; ++idx) { + // if `n_b > 0` then we must have `d_means[idx] != null` and `d_M2s[idx] != null`. + // if `n_b == 0` then `mean_b` and `M2_b` will be null. + // In such situations, we need to set zero for them before merging (all zero partial results + // will not change the final output). 
+ auto const n_b = d_counts[idx]; + auto const mean_b = n_b > 0 ? d_means[idx] : ResultType{0}; + auto const M2_b = n_b > 0 ? d_M2s[idx] : ResultType{0}; + accumulator(n_b, mean_b, M2_b); + } + + // If there are all nulls in the partial results (i.e., sum of valid counts is + // zero), then the output is null. + auto const is_valid = int8_t{accumulator.n_a > 0}; + + return accumulator.n_a > 0 + ? thrust::make_tuple(accumulator.n_a, accumulator.mean_a, accumulator.M2_a, is_valid) + : thrust::make_tuple(size_type{0}, ResultType{0}, ResultType{0}, is_valid); + } +}; + } // namespace std::unique_ptr group_merge_m2(column_view const& values, @@ -71,54 +127,57 @@ std::unique_ptr group_merge_m2(column_view const& values, using ResultType = id_to_type; static_assert( std::is_same_v, ResultType>); - CUDF_EXPECTS(values.child(0).type().id() == type_to_id() && + CUDF_EXPECTS(values.child(0).type().id() == type_id::INT32 && values.child(1).type().id() == type_to_id() && - values.child(2).type().id() == type_id::INT32, + values.child(2).type().id() == type_to_id(), "Input to `group_merge_m2` must be a structs column having children columns " "containing tuples of groupwise (M2_value, mean, valid_count)."); - auto result = make_numeric_column( + auto result_counts = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto result_means = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); - - auto const M2_values = values.child(0); + auto result_M2s = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto validities = rmm::device_uvector(num_groups, stream); + + // Perform merging for all the aggregations. Their output (and their validity data) are written + // out concurrently through an output zip iterator. + using IteratorTuple = thrust::tuple; + using ZipIterator = thrust::zip_iterator; + auto const out_iter = + ZipIterator{thrust::make_tuple(result_counts->mutable_view().template data(), + result_means->mutable_view().template data(), + result_M2s->mutable_view().template data(), + validities.begin())}; + + auto const count_valid = values.child(0); auto const mean_values = values.child(1); - auto const count_valid = values.child(2); + auto const M2_values = values.child(2); auto const iter = thrust::make_counting_iterator(0); - auto validities = rmm::device_uvector(num_groups, stream); - - thrust::transform(rmm::exec_policy(stream), - iter, - iter + num_groups, - result->mutable_view().data(), - [d_M2 = M2_values.template begin(), - d_mean = mean_values.template begin(), - d_count = count_valid.template begin(), - d_offsets = group_offsets.begin(), - d_valid = validities.begin()] __device__(auto const group_idx) { - auto const start_idx = d_offsets[group_idx], - end_idx = d_offsets[group_idx + 1]; - - // Firstly, this stores (M2, mean, valid_count) of the first partial result. - // Then, merge all the following partial results into it. - auto accumulator = accumulate_fn{ - d_M2[start_idx], d_mean[start_idx], d_count[start_idx]}; - - for (auto idx = start_idx + 1; idx < end_idx; ++idx) { - auto const n_b = d_count[idx]; - auto const M2_b = n_b > 0 ? d_M2[idx] : ResultType{0}; - auto const mean_b = n_b > 0 ? d_mean[idx] : ResultType{0}; - accumulator(M2_b, mean_b, n_b); - } - - // If there are all nulls in the partial results (i.e., sum of valid counts is - // zero), then output a null. - d_valid[group_idx] = accumulator.n_a > 0; - return accumulator.n_a > 0 ? 
accumulator.M2_a : ResultType{0}; - }); + auto const fn = merge_fn{group_offsets.begin(), + count_valid.template begin(), + mean_values.template begin(), + M2_values.template begin()}; + thrust::transform(rmm::exec_policy(stream), iter, iter + num_groups, out_iter, fn); + + // Generate bitmask for the output. + // Only mean and M2 values can be nullable. auto [null_mask, null_count] = cudf::detail::valid_if( validities.begin(), validities.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { result->set_null_mask(null_mask, null_count); } + if (null_count > 0) { + result_means->set_null_mask(null_mask, null_count); // copy null_mask + result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask + } + + // Output is a structs column containing the merged values of `COUNT_VALID`, `MEAN`, and `M2`. + std::vector> out_columns; + out_columns.emplace_back(std::move(result_counts)); + out_columns.emplace_back(std::move(result_means)); + out_columns.emplace_back(std::move(result_M2s)); + auto result = cudf::make_structs_column( + num_groups, std::move(out_columns), 0, rmm::device_buffer{0, stream, mr}, stream, mr); return result; } From 96dc79216948f2d57aea1e87e7b1dfa4697469a3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Jul 2021 13:33:35 -0600 Subject: [PATCH 08/19] Finish unit tests for `MERGE_M2` --- cpp/tests/groupby/merge_m2_tests.cpp | 360 +++++++++++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 cpp/tests/groupby/merge_m2_tests.cpp diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp new file mode 100644 index 00000000000..36e1eb584cb --- /dev/null +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace cudf::test::iterators; + +namespace { +constexpr bool print_all{false}; // For debugging +constexpr int32_t null{0}; // Mark for null elements +constexpr double NaN{std::numeric_limits::quiet_NaN()}; // Mark for NaN double elements + +template +using keys_col = cudf::test::fixed_width_column_wrapper; + +template +using vals_col = cudf::test::fixed_width_column_wrapper; + +using counts_col = cudf::test::fixed_width_column_wrapper; + +template +using means_col = cudf::test::fixed_width_column_wrapper; + +template +using M2s_col = cudf::test::fixed_width_column_wrapper; + +using structs_col = cudf::test::structs_column_wrapper; +using vcol_views = std::vector; + +/** + * @brief Compute groupwise `COUNT_VALID`, `MEAN`, `M2` aggregations for the given values + * columns. + * @return A pair of unique keys column and a structs column containing the computed values of + * `COUNT_VALID`, `MEAN`, `M2`. 
+ */ +auto compute_partial_results(cudf::column_view const& keys, cudf::column_view const& values) +{ + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[0].values = values; + requests[0].aggregations.emplace_back(cudf::make_count_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_mean_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); + auto [out_keys, out_results] = gb_obj.aggregate(requests); + + auto const num_output_rows = out_keys->num_rows(); + return std::make_pair( + std::move(out_keys->release()[0]), + cudf::make_structs_column( + num_output_rows, std::move(out_results[0].results), 0, rmm::device_buffer{})); +} + +/** + * @brief Perform merging for partial results of M2 aggregations. + * + * @return A pair of unique keys column and a structs column containing the merged values of + * `COUNT_VALID`, `MEAN`, `M2`. + */ +auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols) +{ + // Append all the keys and values together. + auto const keys = cudf::concatenate(keys_cols); + auto const values = cudf::concatenate(values_cols); + + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[0].values = *values; + requests[0].aggregations.emplace_back(cudf::make_merge_m2_aggregation()); + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); + auto result = gb_obj.aggregate(requests); + return std::make_pair(std::move(result.first->release()[0]), + std::move(result.second[0].results[0])); +} +} // namespace + +template +struct GroupbyMergeM2TypedTest : public cudf::test::BaseFixture { +}; + +using TestTypes = + cudf::test::Concat; +TYPED_TEST_SUITE(GroupbyMergeM2TypedTest, TestTypes); + +TYPED_TEST(GroupbyMergeM2TypedTest, InvalidInput) +{ + using T = TypeParam; + + auto const keys = keys_col{1, 2, 3}; + + // The input column must be a structs column. + { + auto const values = keys_col{1, 2, 3}; + EXPECT_THROW(merge_M2({keys}, {values}), cudf::logic_error); + } + + // The input column must be a structs column having types (int32_t, double, double). 
+ { + auto vals1 = keys_col{1, 2, 3}; + auto vals2 = keys_col{1, 2, 3}; + auto vals3 = keys_col{1, 2, 3}; + auto const vals = structs_col{vals1, vals2, vals3}; + EXPECT_THROW(merge_M2({keys}, {vals}), cudf::logic_error); + } +} + +TYPED_TEST(GroupbyMergeM2TypedTest, EmptyInput) +{ + using T = TypeParam; + using M2_t = cudf::detail::target_type_t; + using mean_t = cudf::detail::target_type_t; + + auto const keys = keys_col{}; + auto vals_count = counts_col{}; + auto vals_mean = means_col{}; + auto vals_M2 = M2s_col{}; + auto const vals = structs_col{vals_count, vals_mean, vals_M2}; + + auto const [out_keys, out_vals] = merge_M2({keys}, {vals}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(vals, *out_vals, print_all); +} + +TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInput) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // Full dataset: + // + // keys = [1, 2, 3, 1, 2, 2, 1, 3, 3, 2] + // vals = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + // + // key = 1: vals = [0, 3, 6] + // key = 2: vals = [1, 4, 5, 9] + // key = 3: vals = [2, 7, 8] + + // Partitioned datasets: + auto const keys1 = keys_col{1, 2, 3}; + auto const keys2 = keys_col{1, 2, 2}; + auto const keys3 = keys_col{1, 3, 3, 2}; + + auto const vals1 = vals_col{0, 1, 2}; + auto const vals2 = vals_col{3, 4, 5}; + auto const vals3 = vals_col{6, 7, 8, 9}; + + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. + // The partial results are also assembled into a structs column. + auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); + auto const [out2_keys, out2_vals] = compute_partial_results(keys2, vals2); + auto const [out3_keys, out3_vals] = compute_partial_results(keys3, vals3); + + // Merge the partial results to the final results. + // Merging can be done in just one merge step, or in multiple steps. + auto const [out4_keys, out4_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out3_keys, *out3_keys}, vcol_views{*out3_vals, *out4_vals}); + + // Get the final M2 values. + auto const out_M2s = final_vals->child(2); + + auto const expected_keys = keys_col{1, 2, 3}; + auto const expected_M2s = M2s_col{18.0, 32.75, 20.0 + 2.0 / 3.0}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); +} + +TYPED_TEST(GroupbyMergeM2TypedTest, InputHasNulls) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // Full dataset: + // + // keys = [1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4] + // vals = [null, 1, 2, 3, 4, null, 6, 7, 8, 9, null] + // + // key = 1: vals = [null, 3, 6] + // key = 2: vals = [1, 4, null, 9] + // key = 3: vals = [2, 8] + // key = 4: vals = [null] + + // Partitioned datasets: + auto const keys1 = keys_col{1, 2, 3, 1}; + auto const keys2 = keys_col{{2, 2, 1, null}, null_at(3)}; + auto const keys3 = keys_col{3, 2, 4}; + + auto const vals1 = vals_col{{null, 1, 2, 3}, null_at(0)}; + auto const vals2 = vals_col{{4, null, 6, 7}, null_at(1)}; + auto const vals3 = vals_col{{8, 9, null}, null_at(2)}; + + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. + // The partial results are also assembled into a structs column. 
+ auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); + auto const [out2_keys, out2_vals] = compute_partial_results(keys2, vals2); + auto const [out3_keys, out3_vals] = compute_partial_results(keys3, vals3); + + // Merge the partial results to the final results. + // Merging can be done in just one merge step, or in multiple steps. + auto const [out4_keys, out4_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + + // Get the final M2 values. + auto const out_M2s = final_vals->child(2); + + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{{4.5, 32.0 + 2.0 / 3.0, 18.0, 0.0 /*NULL*/}, null_at(3)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); +} + +TYPED_TEST(GroupbyMergeM2TypedTest, InputHaveNullsAndNaNs) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // Full dataset: + // + // keys = [4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4] + // vals = [null, null, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN] + // + // key = 1: vals = [0, 3, 6] + // key = 2: vals = [1, 4, NaN, 9] + // key = 3: vals = [null, 2, 8] + // key = 4: vals = [null, 10, NaN] + + // Partitioned datasets: + auto const keys1 = keys_col{4, 3, 1, 2}; + auto const keys2 = keys_col{3, 1, 2}; + auto const keys3 = keys_col{{2, 1, null}, null_at(2)}; + auto const keys4 = keys_col{3, 2, 4, 4}; + + auto const vals1 = vals_col{{0.0 /*NULL*/, 0.0 /*NULL*/, 0.0, 1.0}, nulls_at({0, 1})}; + auto const vals2 = vals_col{2.0, 3.0, 4.0}; + auto const vals3 = vals_col{NaN, 6.0, 7.0}; + auto const vals4 = vals_col{8.0, 9.0, 10.0, NaN}; + + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. + // The partial results are also assembled into a structs column. + auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); + auto const [out2_keys, out2_vals] = compute_partial_results(keys2, vals2); + auto const [out3_keys, out3_vals] = compute_partial_results(keys3, vals3); + auto const [out4_keys, out4_vals] = compute_partial_results(keys4, vals4); + + // Merge the partial results to the final results. + // Merging can be done in just one merge step, or in multiple steps. + auto const [out5_keys, out5_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [out6_keys, out6_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out5_keys, *out6_keys}, vcol_views{*out5_vals, *out6_vals}); + + // Get the final M2 values. + auto const out_M2s = final_vals->child(2); + + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); +} + +TYPED_TEST(GroupbyMergeM2TypedTest, SlicedColumnsInput) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // This test should compute M2 aggregation on the same dataset as the InputHaveNullsAndNaNs test. 
+ // i.e.: + // + // keys = [4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4] + // vals = [null, null, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN] + // + // key = 1: vals = [0, 3, 6] + // key = 2: vals = [1, 4, NaN, 9] + // key = 3: vals = [null, 2, 8] + // key = 4: vals = [null, 10, NaN] + + auto const keys_original = keys_col{{ + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, // discarded, don't care + 4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4, // used + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 // discarded, don't care + }, + null_at(19)}; + auto const vals_original = vals_col{ + { + 3.0, 2.0, 5.0, 4.0, 6.0, 9.0, 1.0, 0.0, 1.0, 7.0, // discarded, don't care + 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN, // used + 9.0, 10.0, 11.0, 12.0, 0.0, 5.0, 1.0, 20.0, 19.0, 15.0 // discarded, don't care + }, + nulls_at({10, 11})}; + + // Partitioned datasets, taken from the original dataset in the range [10, 24). + auto const keys1 = cudf::slice(keys_original, {10, 14})[0]; // {4, 3, 1, 2} + auto const keys2 = cudf::slice(keys_original, {14, 17})[0]; // {3, 1, 2} + auto const keys3 = cudf::slice(keys_original, {17, 20})[0]; // {2, 1, null} + auto const keys4 = cudf::slice(keys_original, {20, 24})[0]; // {3, 2, 4, 4} + + auto const vals1 = cudf::slice(vals_original, {10, 14})[0]; // {null, null, 0.0, 1.0} + auto const vals2 = cudf::slice(vals_original, {14, 17})[0]; // {2.0, 3.0, 4.0} + auto const vals3 = cudf::slice(vals_original, {17, 20})[0]; // {NaN, 6.0, 7.0} + auto const vals4 = cudf::slice(vals_original, {20, 24})[0]; // {8.0, 9.0, 10.0, NaN} + + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. + // The partial results are also assembled into a structs column. + auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); + auto const [out2_keys, out2_vals] = compute_partial_results(keys2, vals2); + auto const [out3_keys, out3_vals] = compute_partial_results(keys3, vals3); + auto const [out4_keys, out4_vals] = compute_partial_results(keys4, vals4); + + // Merge the partial results to the final results. + // Merging can be done in just one merge step, or in multiple steps. + auto const [out5_keys, out5_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [out6_keys, out6_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out5_keys, *out6_keys}, vcol_views{*out5_vals, *out6_vals}); + + // Get the final M2 values. + auto const out_M2s = final_vals->child(2); + + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); +} From 99ab9a15d927cb8f2596470ed522eb977b5bb712 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Jul 2021 14:02:21 -0600 Subject: [PATCH 09/19] Fix copyright header --- cpp/src/groupby/sort/group_m2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index 533a8b93d2e..f71111bf027 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 0ac08fe55631fe10c414dc29b2f9756094c07e22 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 2 Jul 2021 14:12:30 -0600 Subject: [PATCH 10/19] Rename functor --- cpp/src/groupby/sort/group_m2.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index f71111bf027..71260acc165 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -55,12 +55,12 @@ struct m2_transform { }; template -void reduce_by_key_fn(column_device_view const& values, - Iterator values_iter, - cudf::device_span group_labels, - ResultType const* d_means, - ResultType* d_result, - rmm::cuda_stream_view stream) +void compute_m2_fn(column_device_view const& values, + Iterator values_iter, + cudf::device_span group_labels, + ResultType const* d_means, + ResultType* d_result, + rmm::cuda_stream_view stream) { auto const var_iter = cudf::detail::make_counting_transform_iterator( size_type{0}, @@ -95,11 +95,11 @@ struct m2_functor { if (!cudf::is_dictionary(values.type())) { auto const values_iter = d_values.begin(); - reduce_by_key_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + compute_m2_fn(d_values, values_iter, group_labels, d_means, d_result, stream); } else { auto const values_iter = cudf::dictionary::detail::make_dictionary_iterator(*values_dv_ptr); - reduce_by_key_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + compute_m2_fn(d_values, values_iter, group_labels, d_means, d_result, stream); } // M2 column values should have the same bitmask as means's. From 7863b6f18081c283bb99be51f4412c065cdca3c6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 10:07:14 -0600 Subject: [PATCH 11/19] Rewrite the merge functor, adding `partial_result` struct to store intermediate merging result --- cpp/src/groupby/sort/group_merge_m2.cu | 109 +++++++++++++------------ 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 3752b5b1d92..120fe41a314 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -32,31 +32,38 @@ namespace cudf { namespace groupby { namespace detail { - namespace { +/** + * @brief Struct to store partial results for merging. + */ +template +struct partial_result { + size_type count; + result_type mean; + result_type M2; +}; + /** * @brief Functor to accumulate (merge) all partial results corresponding to the same key into a - * final result storing in its member variables. It performs merging for the partial results of + * final result storing in a member variable. It performs merging for the partial results of * `COUNT_VALID`, `MEAN`, and `M2` at the same time. 
*/ -template +template struct accumulate_fn { - size_type n_a; - ResultType mean_a; - ResultType M2_a; + partial_result merge_vals; - void __device__ operator()(size_type const n_b, - ResultType const mean_b, - ResultType const M2_b) noexcept + void __device__ operator()(partial_result const& partial_vals) noexcept { - if (n_b == 0) { return; } - - auto const n_ab = n_a + n_b; - auto const delta = mean_b - mean_a; - M2_a += - M2_b + (delta * delta) * static_cast(n_a) * static_cast(n_b) / n_ab; - mean_a = (mean_a * n_a + mean_b * n_b) / n_ab; - n_a = n_ab; + if (partial_vals.count == 0) { return; } + + auto const n_ab = merge_vals.count + partial_vals.count; + auto const delta = partial_vals.mean - merge_vals.mean; + merge_vals.M2 += partial_vals.M2 + (delta * delta) * + static_cast(merge_vals.count) * + static_cast(partial_vals.count) / n_ab; + merge_vals.mean = + (merge_vals.mean * merge_vals.count + partial_vals.mean * partial_vals.count) / n_ab; + merge_vals.count = n_ab; } }; @@ -64,50 +71,50 @@ struct accumulate_fn { * @brief Functor to merge partial results of `COUNT_VALID`, `MEAN`, and `M2` aggregations * for a given group (key) index. */ -template +template struct merge_fn { size_type const* const d_offsets; size_type const* const d_counts; - ResultType const* const d_means; - ResultType const* const d_M2s; + result_type const* const d_means; + result_type const* const d_M2s; auto __device__ operator()(size_type const group_idx) noexcept { auto const start_idx = d_offsets[group_idx], end_idx = d_offsets[group_idx + 1]; - // This case should never happen, because all groups are non-empty due to the given input. - // Here just to make sure we cover this case. + // This case should never happen, because all groups are non-empty as the results of + // aggregation. Here we just to make sure we cover this case. if (start_idx == end_idx) { - return thrust::make_tuple(size_type{0}, ResultType{0}, ResultType{0}, int8_t{0}); - } - - // Firstly, this stores (valid_count, mean, M2) of the first partial result. - // Then, it accumulates (merges) the remaining partial results into it. - // Note that, if `n_a == 0` then `mean_a` and `M2_a` will be null. - // Thus, in such situations, we need to set zero for them before accumulating partial results. - auto const n_a = d_counts[start_idx]; - auto const mean_a = n_a > 0 ? d_means[start_idx] : ResultType{0}; - auto const M2_a = n_a > 0 ? d_M2s[start_idx] : ResultType{0}; - auto accumulator = accumulate_fn{n_a, mean_a, M2_a}; - - for (auto idx = start_idx + 1; idx < end_idx; ++idx) { - // if `n_b > 0` then we must have `d_means[idx] != null` and `d_M2s[idx] != null`. - // if `n_b == 0` then `mean_b` and `M2_b` will be null. - // In such situations, we need to set zero for them before merging (all zero partial results - // will not change the final output). - auto const n_b = d_counts[idx]; - auto const mean_b = n_b > 0 ? d_means[idx] : ResultType{0}; - auto const M2_b = n_b > 0 ? d_M2s[idx] : ResultType{0}; - accumulator(n_b, mean_b, M2_b); + return thrust::make_tuple(size_type{0}, result_type{0}, result_type{0}, int8_t{0}); } - // If there are all nulls in the partial results (i.e., sum of valid counts is - // zero), then the output is null. - auto const is_valid = int8_t{accumulator.n_a > 0}; - - return accumulator.n_a > 0 - ? 
thrust::make_tuple(accumulator.n_a, accumulator.mean_a, accumulator.M2_a, is_valid) - : thrust::make_tuple(size_type{0}, ResultType{0}, ResultType{0}, is_valid); + // If `(n = d_counts[idx]) > 0` then `d_means[idx] != null` and `d_M2s[idx] != null`. + // Otherwise (`n == 0`), these value (mean and M2) will always be nulls. + // In such cases, reading `mean` and `M2` from memory will return garbage values. + // By setting these values to zero when `n == 0`, we can safely merge the all-zero tuple without + // affecting the final result. + auto get_partial_result = [&] __device__(size_type idx) { + { + auto const n = d_counts[idx]; + return n > 0 ? partial_result{n, d_means[idx], d_M2s[idx]} + : partial_result{size_type{0}, result_type{0}, result_type{0}}; + }; + }; + + // Firstly, store tuple(count, mean, M2) of the first partial result in an accumulator. + auto accumulator = accumulate_fn{get_partial_result(start_idx)}; + + // Then, accumulate (merge) the remaining partial results into that accumulator. + for (auto idx = start_idx + 1; idx < end_idx; ++idx) { accumulator(get_partial_result(idx)); } + + // Get the final result after merging. + auto const& merge_vals = accumulator.merge_vals; + + // If there are all nulls in the partial results (i.e., sum of all valid counts is + // zero), then the output is a null. + auto const is_valid = int8_t{merge_vals.count > 0}; + + return thrust::make_tuple(merge_vals.count, merge_vals.mean, merge_vals.M2, is_valid); } }; @@ -163,7 +170,7 @@ std::unique_ptr group_merge_m2(column_view const& values, thrust::transform(rmm::exec_policy(stream), iter, iter + num_groups, out_iter, fn); // Generate bitmask for the output. - // Only mean and M2 values can be nullable. + // Only mean and M2 values can be nullable. Count column must be non-nullable. auto [null_mask, null_count] = cudf::detail::valid_if( validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { From 0ab2c18bcd6ad1db3c570082604cdbad6e4dc4cf Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 10:07:22 -0600 Subject: [PATCH 12/19] Rewrite doxygen --- cpp/include/cudf/aggregation.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index e309c80a50a..9ab5739b0b6 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -164,9 +164,11 @@ std::unique_ptr make_mean_aggregation(); /** * @brief Factory to create a M2 aggregation * - * A M2 aggregation is groupwise sum of squares of differences from the group mean. It produces the - * intermediate values that are used to compute variance and standard deviation in distributed - * computing. + * A M2 aggregation is groupwise sum of squares of differences from the group mean. That is: + * `M2(group) = SUM((x - MEAN(group)) * (x - MEAN(group)), for all x in group)`. + * + * This aggregation produces the intermediate values that are used to compute variance and standard + * deviation in distributed computing. */ template std::unique_ptr make_m2_aggregation(); @@ -350,6 +352,10 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu * columns generated by three groupby aggregations: `COUNT_VALID`, `MEAN`, and `M2`. The output of * this aggregation is also a structs column containing the merged values of all those aggregations, * which are all required for recursively merging of `M2` values. 
+ * + * The partial results of `M2` aggregation are expected to be all non-negative numbers, since they + * are expected to be output from `M2` aggregation. However, this will not be checked due to + * performance reason. */ template std::unique_ptr make_merge_m2_aggregation(); From 90d984a9166dba44d6cc79f579ed2b3b10f4aa16 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 10:08:11 -0600 Subject: [PATCH 13/19] Add a unit test when the input column column is a structs column with number of children is not 3 --- cpp/tests/groupby/merge_m2_tests.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 36e1eb584cb..1af6f73f90b 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -118,6 +118,14 @@ TYPED_TEST(GroupbyMergeM2TypedTest, InvalidInput) EXPECT_THROW(merge_M2({keys}, {values}), cudf::logic_error); } + // The input column must be a structs column having 3 children. + { + auto vals1 = keys_col{1, 2, 3}; + auto vals2 = vals_col{1.0, 2.0, 3.0}; + auto const vals = structs_col{vals1, vals2}; + EXPECT_THROW(merge_M2({keys}, {vals}), cudf::logic_error); + } + // The input column must be a structs column having types (int32_t, double, double). { auto vals1 = keys_col{1, 2, 3}; From 9967ec27863e53a2ea5fb55292e7c1c91ba90bd9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 10:20:16 -0600 Subject: [PATCH 14/19] Add unit tests for the cases when the input values column has negative numbers --- cpp/tests/groupby/m2_tests.cpp | 23 +++++++++++-- cpp/tests/groupby/merge_m2_tests.cpp | 50 ++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index 656a5af09df..e9a3ed6119a 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -57,8 +57,8 @@ template struct GroupbyM2TypedTest : public cudf::test::BaseFixture { }; -using TestTypes = - cudf::test::Concat; +using TestTypes = cudf::test::Concat, + cudf::test::FloatingPointTypes>; TYPED_TEST_SUITE(GroupbyM2TypedTest, TestTypes); TYPED_TEST(GroupbyM2TypedTest, EmptyInput) @@ -126,6 +126,25 @@ TYPED_TEST(GroupbyM2TypedTest, SimpleInput) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); } +TYPED_TEST(GroupbyM2TypedTest, SimpleInputHavingNegativeValues) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // key = 1: vals = [0, 3, -6] + // key = 2: vals = [1, -4, -5, 9] + // key = 3: vals = [-2, 7, -8] + auto const keys = keys_col{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = vals_col{0, 1, -2, 3, -4, -5, -6, 7, -8, 9}; + + auto const [out_keys, out_M2s] = compute_M2(keys, vals); + auto const expected_keys = keys_col{1, 2, 3}; + auto const expected_M2s = M2s_col{42.0, 122.75, 114.0}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *out_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, *out_M2s, print_all); +} + TYPED_TEST(GroupbyM2TypedTest, ValuesHaveNulls) { using T = TypeParam; diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 1af6f73f90b..92a17bfa7ba 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -102,8 +102,8 @@ template struct GroupbyMergeM2TypedTest : public cudf::test::BaseFixture { }; -using TestTypes = - cudf::test::Concat; +using TestTypes = cudf::test::Concat, + cudf::test::FloatingPointTypes>; 
TYPED_TEST_SUITE(GroupbyMergeM2TypedTest, TestTypes); TYPED_TEST(GroupbyMergeM2TypedTest, InvalidInput) @@ -199,6 +199,52 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInput) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); } +TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInputHavingNegativeValues) +{ + using T = TypeParam; + using R = cudf::detail::target_type_t; + + // Full dataset: + // + // keys = [1, 2, 3, 1, 2, 2, 1, 3, 3, 2] + // vals = [0, 1, -2, 3, -4, -5, -6, 7, -8, 9] + // + // key = 1: vals = [0, 3, -6] + // key = 2: vals = [1, -4, -5, 9] + // key = 3: vals = [-2, 7, -8] + + // Partitioned datasets: + auto const keys1 = keys_col{1, 2, 3}; + auto const keys2 = keys_col{1, 2, 2}; + auto const keys3 = keys_col{1, 3, 3, 2}; + + auto const vals1 = vals_col{0, 1, -2}; + auto const vals2 = vals_col{3, -4, -5}; + auto const vals3 = vals_col{-6, 7, -8, 9}; + + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. + // The partial results are also assembled into a structs column. + auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); + auto const [out2_keys, out2_vals] = compute_partial_results(keys2, vals2); + auto const [out3_keys, out3_vals] = compute_partial_results(keys3, vals3); + + // Merge the partial results to the final results. + // Merging can be done in just one merge step, or in multiple steps. + auto const [out4_keys, out4_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out3_keys, *out3_keys}, vcol_views{*out3_vals, *out4_vals}); + + // Get the final M2 values. + auto const out_M2s = final_vals->child(2); + + auto const expected_keys = keys_col{1, 2, 3}; + auto const expected_M2s = M2s_col{42.0, 122.75, 114.0}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); +} + TYPED_TEST(GroupbyMergeM2TypedTest, InputHasNulls) { using T = TypeParam; From f3d0a3ca1738b7254d8a71ac4985e02ad26b160c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 10:29:19 -0600 Subject: [PATCH 15/19] Fix comments --- cpp/tests/groupby/m2_tests.cpp | 19 ++++++++++--------- cpp/tests/groupby/merge_m2_tests.cpp | 19 ++++++++++--------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index e9a3ed6119a..7611dce2271 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -216,17 +216,18 @@ TYPED_TEST(GroupbyM2TypedTest, SlicedColumnsInput) // key = 3: vals = [null, 2, 8] // key = 4: vals = [null, 10, NaN] - auto const keys_original = keys_col{{ - 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, // discarded, don't care - 4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4, // used - 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 // discarded, don't care - }, - null_at(19)}; + auto const keys_original = + keys_col{{ + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, // will not use, don't care + 4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4, // use this + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 // will not use, don't care + }, + null_at(19)}; auto const vals_original = vals_col{ { - 3.0, 2.0, 5.0, 4.0, 6.0, 9.0, 1.0, 0.0, 1.0, 7.0, // discarded, don't care - 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN, // used - 9.0, 10.0, 11.0, 12.0, 0.0, 5.0, 1.0, 20.0, 19.0, 15.0 // discarded, don't care + 3.0, 2.0, 5.0, 4.0, 6.0, 9.0, 1.0, 0.0, 1.0, 7.0, // will not use, don't care + 0.0, 
0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN, // use this + 9.0, 10.0, 11.0, 12.0, 0.0, 5.0, 1.0, 20.0, 19.0, 15.0 // will not use, don't care }, nulls_at({10, 11})}; diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 92a17bfa7ba..d34bbf3d60f 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -361,17 +361,18 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SlicedColumnsInput) // key = 3: vals = [null, 2, 8] // key = 4: vals = [null, 10, NaN] - auto const keys_original = keys_col{{ - 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, // discarded, don't care - 4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4, // used - 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 // discarded, don't care - }, - null_at(19)}; + auto const keys_original = + keys_col{{ + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, // will not use, don't care + 4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4, // use this + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 // will not use, don't care + }, + null_at(19)}; auto const vals_original = vals_col{ { - 3.0, 2.0, 5.0, 4.0, 6.0, 9.0, 1.0, 0.0, 1.0, 7.0, // discarded, don't care - 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN, // used - 9.0, 10.0, 11.0, 12.0, 0.0, 5.0, 1.0, 20.0, 19.0, 15.0 // discarded, don't care + 3.0, 2.0, 5.0, 4.0, 6.0, 9.0, 1.0, 0.0, 1.0, 7.0, // will not use, don't care + 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, 6.0, 7.0, 8.0, 9.0, 10.0, NaN, // use this + 9.0, 10.0, 11.0, 12.0, 0.0, 5.0, 1.0, 20.0, 19.0, 15.0 // will not use, don't care }, nulls_at({10, 11})}; From bbb961fd0c5c54b9be777392fecb79c91634980e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 10:52:21 -0600 Subject: [PATCH 16/19] Rewrite unit tests, separating multiple steps merging and one step merging --- cpp/tests/groupby/merge_m2_tests.cpp | 177 ++++++++++++++++++--------- 1 file changed, 121 insertions(+), 56 deletions(-) diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index d34bbf3d60f..2f5f5f3b351 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -176,6 +176,10 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInput) auto const vals2 = vals_col{3, 4, 5}; auto const vals3 = vals_col{6, 7, 8, 9}; + // The expected results to validate. + auto const expected_keys = keys_col{1, 2, 3}; + auto const expected_M2s = M2s_col{18.0, 32.75, 20.0 + 2.0 / 3.0}; + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. // The partial results are also assembled into a structs column. auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); @@ -184,19 +188,28 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInput) // Merge the partial results to the final results. // Merging can be done in just one merge step, or in multiple steps. - auto const [out4_keys, out4_vals] = - merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); - auto const [final_keys, final_vals] = - merge_M2(vcol_views{*out3_keys, *out3_keys}, vcol_views{*out3_vals, *out4_vals}); - // Get the final M2 values. 
- auto const out_M2s = final_vals->child(2); + // Multiple steps merging: + { + auto const [out4_keys, out4_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } - auto const expected_keys = keys_col{1, 2, 3}; - auto const expected_M2s = M2s_col{18.0, 32.75, 20.0 + 2.0 / 3.0}; + // One step merging: + { + auto const [final_keys, final_vals] = merge_M2(vcol_views{*out1_keys, *out2_keys, *out3_keys}, + vcol_views{*out1_vals, *out2_vals, *out3_vals}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } } TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInputHavingNegativeValues) @@ -222,6 +235,10 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInputHavingNegativeValues) auto const vals2 = vals_col{3, -4, -5}; auto const vals3 = vals_col{-6, 7, -8, 9}; + // The expected results to validate. + auto const expected_keys = keys_col{1, 2, 3}; + auto const expected_M2s = M2s_col{42.0, 122.75, 114.0}; + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. // The partial results are also assembled into a structs column. auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); @@ -230,19 +247,28 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SimpleInputHavingNegativeValues) // Merge the partial results to the final results. // Merging can be done in just one merge step, or in multiple steps. - auto const [out4_keys, out4_vals] = - merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); - auto const [final_keys, final_vals] = - merge_M2(vcol_views{*out3_keys, *out3_keys}, vcol_views{*out3_vals, *out4_vals}); - // Get the final M2 values. 
- auto const out_M2s = final_vals->child(2); + // Multiple steps merging: + { + auto const [out4_keys, out4_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } - auto const expected_keys = keys_col{1, 2, 3}; - auto const expected_M2s = M2s_col{42.0, 122.75, 114.0}; + // One step merging: + { + auto const [final_keys, final_vals] = merge_M2(vcol_views{*out1_keys, *out2_keys, *out3_keys}, + vcol_views{*out1_vals, *out2_vals, *out3_vals}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } } TYPED_TEST(GroupbyMergeM2TypedTest, InputHasNulls) @@ -269,6 +295,10 @@ TYPED_TEST(GroupbyMergeM2TypedTest, InputHasNulls) auto const vals2 = vals_col{{4, null, 6, 7}, null_at(1)}; auto const vals3 = vals_col{{8, 9, null}, null_at(2)}; + // The expected results to validate. + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{{4.5, 32.0 + 2.0 / 3.0, 18.0, 0.0 /*NULL*/}, null_at(3)}; + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. // The partial results are also assembled into a structs column. auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); @@ -277,19 +307,28 @@ TYPED_TEST(GroupbyMergeM2TypedTest, InputHasNulls) // Merge the partial results to the final results. // Merging can be done in just one merge step, or in multiple steps. - auto const [out4_keys, out4_vals] = - merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); - auto const [final_keys, final_vals] = - merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); - // Get the final M2 values. 
- auto const out_M2s = final_vals->child(2); + // Multiple steps merging: + { + auto const [out4_keys, out4_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } - auto const expected_keys = keys_col{1, 2, 3, 4}; - auto const expected_M2s = M2s_col{{4.5, 32.0 + 2.0 / 3.0, 18.0, 0.0 /*NULL*/}, null_at(3)}; + // One step merging: + { + auto const [final_keys, final_vals] = merge_M2(vcol_views{*out1_keys, *out2_keys, *out3_keys}, + vcol_views{*out1_vals, *out2_vals, *out3_vals}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } } TYPED_TEST(GroupbyMergeM2TypedTest, InputHaveNullsAndNaNs) @@ -318,6 +357,10 @@ TYPED_TEST(GroupbyMergeM2TypedTest, InputHaveNullsAndNaNs) auto const vals3 = vals_col{NaN, 6.0, 7.0}; auto const vals4 = vals_col{8.0, 9.0, 10.0, NaN}; + // The expected results to validate. + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. // The partial results are also assembled into a structs column. auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); @@ -327,22 +370,31 @@ TYPED_TEST(GroupbyMergeM2TypedTest, InputHaveNullsAndNaNs) // Merge the partial results to the final results. // Merging can be done in just one merge step, or in multiple steps. - auto const [out5_keys, out5_vals] = - merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); - auto const [out6_keys, out6_vals] = - merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); - auto const [final_keys, final_vals] = - merge_M2(vcol_views{*out5_keys, *out6_keys}, vcol_views{*out5_vals, *out6_vals}); - - // Get the final M2 values. 
- auto const out_M2s = final_vals->child(2); + // Multiple steps merging: + { + auto const [out5_keys, out5_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [out6_keys, out6_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out5_keys, *out6_keys}, vcol_views{*out5_vals, *out6_vals}); + + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } - auto const expected_keys = keys_col{1, 2, 3, 4}; - auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + // One step merging: + { + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys, *out3_keys, *out4_keys}, + vcol_views{*out1_vals, *out2_vals, *out3_vals, *out4_vals}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } } TYPED_TEST(GroupbyMergeM2TypedTest, SlicedColumnsInput) @@ -387,6 +439,10 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SlicedColumnsInput) auto const vals3 = cudf::slice(vals_original, {17, 20})[0]; // {NaN, 6.0, 7.0} auto const vals4 = cudf::slice(vals_original, {20, 24})[0]; // {8.0, 9.0, 10.0, NaN} + // The expected results to validate. + auto const expected_keys = keys_col{1, 2, 3, 4}; + auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + // Compute partial results (`COUNT_VALID`, `MEAN`, `M2`) of each dataset. // The partial results are also assembled into a structs column. auto const [out1_keys, out1_vals] = compute_partial_results(keys1, vals1); @@ -396,20 +452,29 @@ TYPED_TEST(GroupbyMergeM2TypedTest, SlicedColumnsInput) // Merge the partial results to the final results. // Merging can be done in just one merge step, or in multiple steps. - auto const [out5_keys, out5_vals] = - merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); - auto const [out6_keys, out6_vals] = - merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); - - auto const [final_keys, final_vals] = - merge_M2(vcol_views{*out5_keys, *out6_keys}, vcol_views{*out5_vals, *out6_vals}); - // Get the final M2 values. 
- auto const out_M2s = final_vals->child(2); + // Multiple steps merging: + { + auto const [out5_keys, out5_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys}, vcol_views{*out1_vals, *out2_vals}); + auto const [out6_keys, out6_vals] = + merge_M2(vcol_views{*out3_keys, *out4_keys}, vcol_views{*out3_vals, *out4_vals}); + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out5_keys, *out6_keys}, vcol_views{*out5_vals, *out6_vals}); + + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } - auto const expected_keys = keys_col{1, 2, 3, 4}; - auto const expected_M2s = M2s_col{18.0, NaN, 18.0, NaN}; + // One step merging: + { + auto const [final_keys, final_vals] = + merge_M2(vcol_views{*out1_keys, *out2_keys, *out3_keys, *out4_keys}, + vcol_views{*out1_vals, *out2_vals, *out3_vals, *out4_vals}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + auto const out_M2s = final_vals->child(2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_keys, *final_keys, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_M2s, out_M2s, print_all); + } } From 26e17f6771e7699edaf35a120290991279c6880e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 11:08:31 -0600 Subject: [PATCH 17/19] Change `ResultType` to `result_type` to enforce name consistency --- cpp/src/groupby/sort/group_m2.cu | 15 +++++++----- cpp/src/groupby/sort/group_merge_m2.cu | 32 +++++++++++++------------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index 71260acc165..a72f6c6f647 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -44,7 +44,7 @@ struct m2_transform { __device__ ResultType operator()(size_type const idx) const noexcept { - if (d_values.is_null(idx)) return 0.0; + if (d_values.is_null(idx)) { return 0.0; } auto const x = static_cast(values_iter[idx]); auto const group_idx = d_group_labels[idx]; @@ -84,14 +84,17 @@ struct m2_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using ResultType = cudf::detail::target_type_t; - auto result = make_numeric_column( - data_type(type_to_id()), group_means.size(), mask_state::UNALLOCATED, stream, mr); + using result_type = cudf::detail::target_type_t; + auto result = make_numeric_column(data_type(type_to_id()), + group_means.size(), + mask_state::UNALLOCATED, + stream, + mr); auto const values_dv_ptr = column_device_view::create(values, stream); auto const d_values = *values_dv_ptr; - auto const d_means = group_means.data(); - auto const d_result = result->mutable_view().data(); + auto const d_means = group_means.data(); + auto const d_result = result->mutable_view().data(); if (!cudf::is_dictionary(values.type())) { auto const values_iter = d_values.begin(); diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 120fe41a314..1545f1830f2 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -131,42 +131,42 @@ std::unique_ptr group_merge_m2(column_view const& values, CUDF_EXPECTS(values.num_children() == 3, "Input to `group_merge_m2` must be a structs column having 3 children columns."); - using ResultType = id_to_type; + using result_type = id_to_type; static_assert( - 
std::is_same_v, ResultType>); + std::is_same_v, result_type>); CUDF_EXPECTS(values.child(0).type().id() == type_id::INT32 && - values.child(1).type().id() == type_to_id() && - values.child(2).type().id() == type_to_id(), + values.child(1).type().id() == type_to_id() && + values.child(2).type().id() == type_to_id(), "Input to `group_merge_m2` must be a structs column having children columns " "containing tuples of groupwise (M2_value, mean, valid_count)."); auto result_counts = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); auto result_means = make_numeric_column( - data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); auto result_M2s = make_numeric_column( - data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); auto validities = rmm::device_uvector(num_groups, stream); // Perform merging for all the aggregations. Their output (and their validity data) are written // out concurrently through an output zip iterator. - using IteratorTuple = thrust::tuple; - using ZipIterator = thrust::zip_iterator; + using iterator_tuple = thrust::tuple; + using output_iterator = thrust::zip_iterator; auto const out_iter = - ZipIterator{thrust::make_tuple(result_counts->mutable_view().template data(), - result_means->mutable_view().template data(), - result_M2s->mutable_view().template data(), - validities.begin())}; + output_iterator{thrust::make_tuple(result_counts->mutable_view().template data(), + result_means->mutable_view().template data(), + result_M2s->mutable_view().template data(), + validities.begin())}; auto const count_valid = values.child(0); auto const mean_values = values.child(1); auto const M2_values = values.child(2); auto const iter = thrust::make_counting_iterator(0); - auto const fn = merge_fn{group_offsets.begin(), - count_valid.template begin(), - mean_values.template begin(), - M2_values.template begin()}; + auto const fn = merge_fn{group_offsets.begin(), + count_valid.template begin(), + mean_values.template begin(), + M2_values.template begin()}; thrust::transform(rmm::exec_policy(stream), iter, iter + num_groups, out_iter, fn); // Generate bitmask for the output. 
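The merge step that `accumulate_fn` and `merge_fn` implement in the patches above is the standard parallel update for combining partial (count, mean, M2) tuples, as referenced in the doxygen of `group_merge_m2`. Below is a minimal host-side sketch of that update: plain standalone C++, not part of any of these patches, where the `partial` struct and `merge` function are illustrative names only and the numbers in `main` are taken from the SimpleInput unit test (key = 2, whose full value set {1, 4, 5, 9} is split across the three partitions as {1}, {4, 5} and {9}).

// Host-side sketch (not libcudf code) of the pairwise update performed by `accumulate_fn`.
// `partial` and `merge` are illustrative names, not part of the patch or the cudf API.
#include <cstdint>
#include <iostream>

struct partial {
  std::int64_t count;  // number of valid values
  double mean;         // mean of the valid values
  double M2;           // sum of squared deviations from the mean
};

partial merge(partial const& a, partial const& b)
{
  // Empty partial results contribute nothing (mirrors the n == 0 handling in merge_fn).
  if (b.count == 0) { return a; }
  if (a.count == 0) { return b; }

  auto const n     = a.count + b.count;
  auto const delta = b.mean - a.mean;
  // M2_ab = M2_a + M2_b + delta^2 * n_a * n_b / n_ab
  auto const M2 = a.M2 + b.M2 +
                  delta * delta * static_cast<double>(a.count) *
                    static_cast<double>(b.count) / static_cast<double>(n);
  auto const mean = (a.mean * a.count + b.mean * b.count) / static_cast<double>(n);
  return partial{n, mean, M2};
}

int main()
{
  // Key 2 of the SimpleInput test: the values {1, 4, 5, 9} arrive as the partial
  // results of the three partitions {1}, {4, 5} and {9}.
  partial const p1{1, 1.0, 0.0};
  partial const p2{2, 4.5, 0.5};
  partial const p3{1, 9.0, 0.0};

  auto const merged = merge(merge(p1, p2), p3);
  std::cout << merged.count << " " << merged.mean << " " << merged.M2 << "\n";
  // Prints: 4 4.75 32.75 -- the same M2 the test expects for key 2.
}

Applied groupwise on the device, this same update is what produces the merged COUNT_VALID/MEAN/M2 structs column returned by `group_merge_m2`.
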
From 18728460d7c1dd88a06e402ac3b77756565d847c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 21:05:41 -0600 Subject: [PATCH 18/19] Rewrite doxygen --- cpp/include/cudf/aggregation.hpp | 29 ++++++++++++----------- cpp/src/groupby/sort/group_merge_m2.cu | 2 +- cpp/src/groupby/sort/group_reductions.hpp | 14 +++++------ cpp/tests/groupby/merge_m2_tests.cpp | 7 +++--- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 9ab5739b0b6..a2f59de54db 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -67,9 +67,9 @@ class aggregation { ALL, ///< all reduction SUM_OF_SQUARES, ///< sum of squares reduction MEAN, ///< arithmetic mean reduction - M2, ///< groupwise sum of squares of differences from the group mean - VARIANCE, ///< groupwise variance - STD, ///< groupwise standard deviation + M2, ///< sum of squares of differences from the mean + VARIANCE, ///< variance + STD, ///< standard deviation MEDIAN, ///< median reduction QUANTILE, ///< compute specified quantile(s) ARGMAX, ///< Index of max element @@ -164,11 +164,13 @@ std::unique_ptr make_mean_aggregation(); /** * @brief Factory to create a M2 aggregation * - * A M2 aggregation is groupwise sum of squares of differences from the group mean. That is: - * `M2(group) = SUM((x - MEAN(group)) * (x - MEAN(group)), for all x in group)`. + * A M2 aggregation is sum of squares of differences from the mean. That is: + * `M2 = SUM((x - MEAN) * (x - MEAN))`. * * This aggregation produces the intermediate values that are used to compute variance and standard - * deviation in distributed computing. + * deviation across multiple discrete sets. See + * `https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm` for more + * detail. */ template std::unique_ptr make_m2_aggregation(); @@ -347,15 +349,14 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu /** * @brief Factory to create a MERGE_M2 aggregation * - * This aggregation is designed specifically to accommodate distributed computing of `M2` - * aggregation. The partial results input to this aggregation is a structs column with children are - * columns generated by three groupby aggregations: `COUNT_VALID`, `MEAN`, and `M2`. The output of - * this aggregation is also a structs column containing the merged values of all those aggregations, - * which are all required for recursively merging of `M2` values. + * Merges the results of `M2` aggregations on independent sets into a new `M2` value equivalent to + * if a single `M2` aggregation was done across all of the sets at once. This aggregation is only + * valid on structs whose members are the result of the `COUNT_VALID`, `MEAN`, and `M2` aggregations + * on the same sets. The output of this aggregation is a struct containing the merged `COUNT_VALID`, + * `MEAN`, and `M2` aggregations. * - * The partial results of `M2` aggregation are expected to be all non-negative numbers, since they - * are expected to be output from `M2` aggregation. However, this will not be checked due to - * performance reason. + * The input `M2` aggregation values are expected to be all non-negative numbers, since they + * were output from `M2` aggregation. 
*/ template std::unique_ptr make_merge_m2_aggregation(); diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 1545f1830f2..15ada3e9d74 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -138,7 +138,7 @@ std::unique_ptr group_merge_m2(column_view const& values, values.child(1).type().id() == type_to_id() && values.child(2).type().id() == type_to_id(), "Input to `group_merge_m2` must be a structs column having children columns " - "containing tuples of groupwise (M2_value, mean, valid_count)."); + "containing tuples of (M2_value, mean, valid_count)."); auto result_counts = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 2b88317fcc3..2770162da2d 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -218,9 +218,9 @@ std::unique_ptr group_count_all(cudf::device_span group rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to calculate groupwise sum of squares of differences from group means. + * @brief Internal API to calculate sum of squares of differences from means. * - * If there are only nulls in the group, the output value of that group will be `0`. + * If there are only nulls in the group, the output value of that group will be null. * * @code{.pseudo} * values = [2, 1, 4, -1, -2, , 4, ] @@ -420,17 +420,17 @@ std::unique_ptr group_merge_lists(column_view const& values, * @brief Internal API to merge grouped M2 values corresponding to the same key. * * The values of M2 are merged following the parallel algorithm described here: - * https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Parallel_algorithm + * `https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm` * - * Merging M2 values require accessing to partial M2 values and also groupwise means and group valid - * counts. Thus, the input to this aggregation need to be a structs column containing tuples of - * groupwise `(valid_count, mean, M2_value)`. + * Merging M2 values require accessing to partial M2 values, means, and valid counts. Thus, the + * input to this aggregation need to be a structs column containing tuples of 3 values + * `(valid_count, mean, M2)`. * * This aggregation not only merges the partial results of `M2` but also merged all the partial * results of input aggregations (`COUNT_VALID`, `MEAN`, and `M2`). As such, the output will be a * structs column containing children columns of merged `COUNT_VALID`, `MEAN`, and `M2` values. * - * @param values Grouped values (tuples of groupwise `(valid_count, mean, M2_value)`) to merge. + * @param values Grouped values (tuples of values `(valid_count, mean, M2)`) to merge. * @param group_offsets Offsets of groups' starting points within @p values. * @param num_groups Number of groups. * @param mr Device memory resource used to allocate the returned column's device memory diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 2f5f5f3b351..63451f9612d 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -50,10 +50,9 @@ using structs_col = cudf::test::structs_column_wrapper; using vcol_views = std::vector; /** - * @brief Compute groupwise `COUNT_VALID`, `MEAN`, `M2` aggregations for the given values - * columns. 
+ * @brief Compute `COUNT_VALID`, `MEAN`, `M2` aggregations for the given values columns. * @return A pair of unique keys column and a structs column containing the computed values of - * `COUNT_VALID`, `MEAN`, `M2`. + * (`COUNT_VALID`, `MEAN`, `M2`). */ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view const& values) { @@ -78,7 +77,7 @@ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view co * @brief Perform merging for partial results of M2 aggregations. * * @return A pair of unique keys column and a structs column containing the merged values of - * `COUNT_VALID`, `MEAN`, `M2`. + * (`COUNT_VALID`, `MEAN`, `M2`). */ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols) { From a1d00b1a02ffab11cb4c7cc6997eafb7fd13eee8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 6 Jul 2021 21:40:38 -0600 Subject: [PATCH 19/19] Fix formatting --- cpp/src/groupby/sort/group_merge_m2.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 15ada3e9d74..4e2a5b68abc 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -105,7 +105,9 @@ struct merge_fn { auto accumulator = accumulate_fn{get_partial_result(start_idx)}; // Then, accumulate (merge) the remaining partial results into that accumulator. - for (auto idx = start_idx + 1; idx < end_idx; ++idx) { accumulator(get_partial_result(idx)); } + for (auto idx = start_idx + 1; idx < end_idx; ++idx) { + accumulator(get_partial_result(idx)); + } // Get the final result after merging. auto const& merge_vals = accumulator.merge_vals;
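
End to end, the pattern exercised by the tests is: compute COUNT_VALID, MEAN and M2 per partition, assemble the three result columns into a structs column, concatenate the per-partition keys and structs, and run a single MERGE_M2 groupby over the concatenation (see `compute_partial_results` and `merge_M2` in merge_m2_tests.cpp). Variance and standard deviation can then be derived from the merged tuple. The snippet below is a minimal host-side sketch of that last step, operating on scalar values rather than cudf columns and not using any libcudf API; the input numbers come from the merged result for key = 2 in the SimpleInput test, and ddof is the usual delta degrees of freedom (divisor N - ddof).

// Host-side sketch (not libcudf code): deriving variance and standard deviation
// from one merged (count, mean, M2) tuple produced by a MERGE_M2 aggregation.
#include <cmath>
#include <cstdio>

int main()
{
  // Merged (count, M2) for key = 2 in the SimpleInput test, i.e. values {1, 4, 5, 9}.
  int const count = 4;
  double const M2 = 32.75;

  int const ddof        = 1;                    // delta degrees of freedom
  double const variance = M2 / (count - ddof);  // 32.75 / 3 ~= 10.9167
  double const stddev   = std::sqrt(variance);  // ~= 3.3040

  std::printf("variance = %.4f, stddev = %.4f\n", variance, stddev);
  return 0;
}

Note that the merged mean is not needed to compute the final variance itself; it is carried in the output, together with the valid count, so that further rounds of merging remain possible.
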