From fdb6d34d0800e841de71215622cc59dae76671c1 Mon Sep 17 00:00:00 2001
From: Steve Bronder <sbronder@flatironinstitute.org>
Date: Tue, 12 Sep 2023 17:37:46 -0400
Subject: [PATCH 1/9] Fixes adjoint accumulation for reverse mode where
 aliasing can occur. Creates a assignment op tag that is used by
 adjoint_results to do a += instead of a = into the adjoint matrix

---
 stan/math/opencl/kernel_generator.hpp         |  2 +-
 .../kernel_generator/as_operation_cl.hpp      | 18 ++--
 .../kernel_generator/assignment_ops.hpp       | 71 +++++++++++++++
 stan/math/opencl/kernel_generator/load.hpp    | 18 ++--
 .../kernel_generator/multi_result_kernel.hpp  | 40 +++++++--
 .../opencl/kernel_generator/operation_cl.hpp  | 21 ++++-
 stan/math/opencl/prim/normal_lccdf.hpp        |  9 +-
 stan/math/opencl/rev.hpp                      |  1 +
 stan/math/opencl/rev/adjoint_results.hpp      | 18 ++--
 stan/math/opencl/rev/grad.hpp                 | 31 +++++++
 .../kernel_generator/assignment_ops_test.cpp  | 27 ++++++
 test/unit/math/opencl/rev/add_test.cpp        | 86 +++++--------------
 test/unit/math/opencl/rev/grad_test.cpp       | 37 ++++++++
 13 files changed, 275 insertions(+), 104 deletions(-)
 create mode 100644 stan/math/opencl/kernel_generator/assignment_ops.hpp
 create mode 100644 stan/math/opencl/rev/grad.hpp
 create mode 100644 test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
 create mode 100644 test/unit/math/opencl/rev/grad_test.cpp
diff --git a/stan/math/opencl/kernel_generator.hpp b/stan/math/opencl/kernel_generator.hpp
index a3cb839d776..fd78ce3f9a8 100644
--- a/stan/math/opencl/kernel_generator.hpp
+++ b/stan/math/opencl/kernel_generator.hpp
@@ -108,7 +108,7 @@
 #include <stan/math/opencl/kernel_generator/as_operation_cl.hpp>
 #include <stan/math/opencl/kernel_generator/name_generator.hpp>
 #include <stan/math/opencl/kernel_generator/type_str.hpp>
-
+#include <stan/math/opencl/kernel_generator/assignment_ops.hpp>
 #include <stan/math/opencl/kernel_generator/as_column_vector_or_scalar.hpp>
 #include <stan/math/opencl/kernel_generator/load.hpp>
 #include <stan/math/opencl/kernel_generator/scalar.hpp>
diff --git a/stan/math/opencl/kernel_generator/as_operation_cl.hpp b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
index c6e5e819174..a7cf2ac7264 100644
--- a/stan/math/opencl/kernel_generator/as_operation_cl.hpp
+++ b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
@@ -2,6 +2,7 @@
 #define STAN_MATH_OPENCL_KERNEL_GENERATOR_AS_OPERATION_CL_HPP
 #ifdef STAN_OPENCL
 
+#include <stan/math/opencl/kernel_generator/assignment_ops.hpp>
 #include <stan/math/opencl/kernel_generator/operation_cl.hpp>
 #include <stan/math/opencl/kernel_generator/load.hpp>
 #include <stan/math/opencl/kernel_generator/scalar.hpp>
@@ -23,7 +24,7 @@ namespace math {
  * @param a an operation
  * @return operation
  */
-template <typename T_operation,
+template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_operation,
           typename = std::enable_if_t<std::is_base_of<
               operation_cl_base, std::remove_reference_t<T_operation>>::value>>
 inline T_operation&& as_operation_cl(T_operation&& a) {
@@ -37,7 +38,7 @@ inline T_operation&& as_operation_cl(T_operation&& a) {
  * @param a scalar
  * @return \c scalar_ wrapping the input
  */
-template <typename T_scalar, typename = require_arithmetic_t<T_scalar>,
+template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_scalar, typename = require_arithmetic_t<T_scalar>,
           require_not_same_t<T_scalar, bool>* = nullptr>
 inline scalar_<T_scalar> as_operation_cl(const T_scalar a) {
   return scalar_<T_scalar>(a);
@@ -50,6 +51,7 @@ inline scalar_<T_scalar> as_operation_cl(const T_scalar a) {
  * @param a scalar
  * @return \c scalar_<char> wrapping the input
  */
+template <assignment_ops_cl AssignOp = assignment_ops_cl::equals>
 inline scalar_<char> as_operation_cl(const bool a) { return scalar_<char>(a); }
 
 /**
@@ -59,11 +61,11 @@ inline scalar_<char> as_operation_cl(const bool a) { return scalar_<char>(a); }
  * @param a \c matrix_cl
  * @return \c load_ wrapping the input
  */
-template <typename T_matrix_cl,
+template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_matrix_cl,
           typename = require_any_t<is_matrix_cl<T_matrix_cl>,
                                    is_arena_matrix_cl<T_matrix_cl>>>
-inline load_<T_matrix_cl> as_operation_cl(T_matrix_cl&& a) {
-  return load_<T_matrix_cl>(std::forward<T_matrix_cl>(a));
+inline load_<T_matrix_cl, AssignOp> as_operation_cl(T_matrix_cl&& a) {
+  return load_<T_matrix_cl, AssignOp>(std::forward<T_matrix_cl>(a));
 }
 
 /**
@@ -74,11 +76,11 @@ inline load_<T_matrix_cl> as_operation_cl(T_matrix_cl&& a) {
  * rvalue reference, the reference is removed, so that a variable of this type
  * actually stores the value.
  */
-template <typename T>
+template <typename T, assignment_ops_cl AssignOp = assignment_ops_cl::equals>
 using as_operation_cl_t = std::conditional_t<
     std::is_lvalue_reference<T>::value,
-    decltype(as_operation_cl(std::declval<T>())),
-    std::remove_reference_t<decltype(as_operation_cl(std::declval<T>()))>>;
+    decltype(as_operation_cl<AssignOp>(std::declval<T>())),
+    std::remove_reference_t<decltype(as_operation_cl<AssignOp>(std::declval<T>()))>>;
 
 /** @}*/
 }  // namespace math
diff --git a/stan/math/opencl/kernel_generator/assignment_ops.hpp b/stan/math/opencl/kernel_generator/assignment_ops.hpp
new file mode 100644
index 00000000000..5e3919ced95
--- /dev/null
+++ b/stan/math/opencl/kernel_generator/assignment_ops.hpp
@@ -0,0 +1,71 @@
+#ifndef STAN_MATH_OPENCL_KERNEL_GENERATOR_ASSIGNMENT_OPS
+#define STAN_MATH_OPENCL_KERNEL_GENERATOR_ASSIGNMENT_OPS
+#ifdef STAN_OPENCL
+#include <stan/math/prim/meta/is_detected.hpp>
+
+namespace stan {
+namespace math {
+
+/**
+ * Ops that decide the type of assignment for LHS operations
+ */
+enum class assignment_ops_cl {equals, plus_equals, minus_equals, divide_equals};
+
+/**
+ * @param value A static constexpr const char* member for printing assignment ops
+ */
+template <assignment_ops_cl assign_op>
+struct assignment_op_str;
+
+template <>
+struct assignment_op_str<assignment_ops_cl::equals> {
+  static constexpr const char* value = " = ";
+};
+
+template <>
+struct assignment_op_str<assignment_ops_cl::plus_equals> {
+  static constexpr const char* value = " += ";
+};
+
+template <>
+struct assignment_op_str<assignment_ops_cl::minus_equals> {
+  static constexpr const char* value = " *= ";
+};
+
+template <>
+struct assignment_op_str<assignment_ops_cl::divide_equals> {
+  static constexpr const char* value = " /= ";
+};
+
+
+namespace internal {
+template <typename, typename = void>
+struct has_assignment_op_str : std::false_type {};
+
+template <typename T>
+struct has_assignment_op_str<T, void_t<decltype(T::assignment_op)>> : std::true_type {};
+
+}  // namespace internal
+
+/**
+ * @tparam T A type that does not have an `assignment_op` static constexpr member type 
+ * @return A constexpr const char* equal to `" = "`
+ */
+template <typename T, std::enable_if_t<!internal::has_assignment_op_str<std::decay_t<T>>::value>* = nullptr>
+inline constexpr const char* assignment_op() noexcept {
+  return " = ";
+}
+
+/**
+ * @tparam T A type that has an `assignment_op` static constexpr member type
+ * @return The types assignment op as a constexpr const char*
+ */
+template <typename T, std::enable_if_t<internal::has_assignment_op_str<T>::value>* = nullptr>
+inline constexpr const char* assignment_op() noexcept {
+  return assignment_op_str<std::decay_t<T>::assignment_op>::value;
+}
+
+}
+}
+#endif
+#endif
diff --git a/stan/math/opencl/kernel_generator/load.hpp b/stan/math/opencl/kernel_generator/load.hpp
index 319557959b6..bd3eaafb1c3 100644
--- a/stan/math/opencl/kernel_generator/load.hpp
+++ b/stan/math/opencl/kernel_generator/load.hpp
@@ -4,6 +4,8 @@
 
 #include <stan/math/opencl/matrix_cl.hpp>
 #include <stan/math/opencl/matrix_cl_view.hpp>
+#include <stan/math/opencl/kernel_generator/assignment_ops.hpp>
+
 #include <stan/math/opencl/kernel_generator/type_str.hpp>
 #include <stan/math/opencl/kernel_generator/name_generator.hpp>
 #include <stan/math/opencl/kernel_generator/operation_cl.hpp>
@@ -23,17 +25,20 @@ namespace math {
 /**
  * Represents an access to a \c matrix_cl in kernel generator expressions
  * @tparam T \c matrix_cl
+ * @tparam AssignOp tells higher level operations whether the final operation should be an assignment or a type of compound assignment.
  */
-template <typename T>
+template <typename T, assignment_ops_cl AssignOp = assignment_ops_cl::equals>
 class load_
-    : public operation_cl_lhs<load_<T>,
+    : public operation_cl_lhs<load_<T, AssignOp>,
                               typename std::remove_reference_t<T>::type> {
  protected:
   T a_;
 
  public:
+
+  static constexpr assignment_ops_cl assignment_op = AssignOp;
   using Scalar = typename std::remove_reference_t<T>::type;
-  using base = operation_cl<load_<T>, Scalar>;
+  using base = operation_cl<load_<T, AssignOp>, Scalar>;
   using base::var_name_;
   static_assert(disjunction<is_matrix_cl<T>, is_arena_matrix_cl<T>>::value,
                 "load_: argument a must be a matrix_cl<T>!");
@@ -51,9 +56,9 @@ class load_
    * Creates a deep copy of this expression.
    * @return copy of \c *this
    */
-  inline load_<T&> deep_copy() & { return load_<T&>(a_); }
-  inline load_<const T&> deep_copy() const& { return load_<const T&>(a_); }
-  inline load_<T> deep_copy() && { return load_<T>(std::forward<T>(a_)); }
+  inline load_<T&, AssignOp> deep_copy() & { return load_<T&, AssignOp>(a_); }
+  inline load_<const T&, AssignOp> deep_copy() const& { return load_<const T&, AssignOp>(a_); }
+  inline load_<T, AssignOp> deep_copy() && { return load_<T, AssignOp>(std::forward<T>(a_)); }
 
   /**
    * Generates kernel code for this expression.
@@ -327,6 +332,7 @@ class load_
     }
   }
 };
+
 /** @}*/
 }  // namespace math
 }  // namespace stan
diff --git a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
index a82d033c5f3..3b9362ced06 100644
--- a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
+++ b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
@@ -5,6 +5,7 @@
 #include <stan/math/prim/err/check_size_match.hpp>
 #include <stan/math/prim/meta/is_kernel_expression.hpp>
 #include <stan/math/opencl/kernel_generator/name_generator.hpp>
+#include <stan/math/opencl/kernel_generator/assignment_ops.hpp>
 #include <stan/math/opencl/kernel_generator/as_operation_cl.hpp>
 #include <stan/math/opencl/kernel_generator/calc_if.hpp>
 #include <stan/math/opencl/kernel_generator/check_cl.hpp>
@@ -334,13 +335,34 @@ class results_cl {
                                         == sizeof...(T_expressions)>>
   void operator+=(const expressions_cl<T_expressions...>& exprs) {
     index_apply<sizeof...(T_expressions)>([this, &exprs](auto... Is) {
-      auto tmp = std::tuple_cat(make_assignment_pair(
+      auto tmp = std::tuple_cat(make_assignment_pair<assignment_ops_cl::plus_equals>(
           std::get<Is>(results_), std::get<Is>(exprs.expressions_))...);
       index_apply<std::tuple_size<decltype(tmp)>::value>(
           [this, &tmp](auto... Is2) {
             assignment_impl(std::make_tuple(std::make_pair(
-                std::get<Is2>(tmp).first,
-                std::get<Is2>(tmp).first + std::get<Is2>(tmp).second)...));
+                std::get<Is2>(tmp).first, std::get<Is2>(tmp).second)...));
+          });
+    });
+  }
+
+    /**
+   * Incrementing \c results_ object by \c expressions_cl object
+   * executes the kernel that evaluates expressions and increments results by
+   * those expressions.
+   * @tparam T_expressions types of expressions
+   * @param exprs expressions
+   */
+  template <typename... T_expressions,
+            typename = std::enable_if_t<sizeof...(T_results)
+                                        == sizeof...(T_expressions)>>
+  void operator-=(const expressions_cl<T_expressions...>& exprs) {
+    index_apply<sizeof...(T_expressions)>([this, &exprs](auto... Is) {
+      auto tmp = std::tuple_cat(make_assignment_pair<assignment_ops_cl::minus_equals>(
+          std::get<Is>(results_), std::get<Is>(exprs.expressions_))...);
+      index_apply<std::tuple_size<decltype(tmp)>::value>(
+          [this, &tmp](auto... Is2) {
+            assignment_impl(std::make_tuple(std::make_pair(
+                std::get<Is2>(tmp).first, std::get<Is2>(tmp).second)...));
           });
     });
   }
@@ -426,7 +448,7 @@ class results_cl {
           + parts.reduction_2d +
           "}\n";
     }
-    return src;
+     return src;
   }
 
   /**
@@ -529,7 +551,7 @@ class results_cl {
    * @param expression expression
    * @return a tuple of pair of result and expression
    */
-  template <typename T_result, typename T_expression,
+  template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_result, typename T_expression,
             require_all_not_t<is_without_output<T_expression>,
                               conjunction<internal::is_scalar_check<T_result>,
                                           std::is_arithmetic<std::decay_t<
@@ -537,8 +559,8 @@ class results_cl {
   static auto make_assignment_pair(T_result&& result,
                                    T_expression&& expression) {
     return std::make_tuple(
-        std::pair<as_operation_cl_t<T_result>, as_operation_cl_t<T_expression>>(
-            as_operation_cl(std::forward<T_result>(result)),
+        std::pair<as_operation_cl_t<T_result, AssignOp>, as_operation_cl_t<T_expression>>(
+            as_operation_cl<AssignOp>(std::forward<T_result>(result)),
             as_operation_cl(std::forward<T_expression>(expression))));
   }
 
@@ -548,7 +570,7 @@ class results_cl {
    * @param expression expression
    * @return a tuple of pair of result and expression
    */
-  template <typename T_result, typename T_expression,
+  template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_result, typename T_expression,
             require_t<is_without_output<T_expression>>* = nullptr>
   static auto make_assignment_pair(T_result&& result,
                                    T_expression&& expression) {
@@ -562,7 +584,7 @@ class results_cl {
    * @param pass bool scalar
    * @return an empty tuple
    */
-  template <typename T_check, typename T_pass,
+  template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_check, typename T_pass,
             require_t<internal::is_scalar_check<T_check>>* = nullptr,
             require_integral_t<T_pass>* = nullptr>
   static std::tuple<> make_assignment_pair(T_check&& result, T_pass&& pass) {
diff --git a/stan/math/opencl/kernel_generator/operation_cl.hpp b/stan/math/opencl/kernel_generator/operation_cl.hpp
index ef2d4977c97..84f026dbdf6 100644
--- a/stan/math/opencl/kernel_generator/operation_cl.hpp
+++ b/stan/math/opencl/kernel_generator/operation_cl.hpp
@@ -4,6 +4,7 @@
 
 #include <stan/math/prim/meta.hpp>
 #include <stan/math/prim/err/check_nonnegative.hpp>
+#include <stan/math/opencl/kernel_generator/assignment_ops.hpp>
 #include <stan/math/opencl/kernel_generator/type_str.hpp>
 #include <stan/math/opencl/kernel_generator/name_generator.hpp>
 #include <stan/math/opencl/matrix_cl_view.hpp>
@@ -74,6 +75,24 @@ struct kernel_parts {
   }
 };
 
+std::ostream& operator<<(std::ostream& os, kernel_parts& parts) {
+      os << "args:" << std::endl;
+      os <<                parts.args.substr(0, parts.args.size() - 2)  << std::endl;
+      os << "Decl:" << std::endl;
+           os << parts.declarations << std::endl;
+           os << "Init:" << std::endl;
+           os << parts.initialization << std::endl;
+           os << "body:" << std::endl;
+           os << parts.body << std::endl;
+           os << "body_suffix:" << std::endl;
+           os << parts.body_suffix << std::endl;
+           os << "reduction_1d:" << std::endl;
+           os << parts.reduction_1d << std::endl;
+           os << "reduction_2d:" << std::endl;
+           os << parts.reduction_2d << std::endl;
+    return os;
+}
+
 /**
  * Base for all kernel generator operations.
  * @tparam Derived derived type
@@ -201,7 +220,7 @@ class operation_cl : public operation_cl_base {
         generated, generated_all, ng, row_index_name, col_index_name, false);
     kernel_parts out_parts = result.get_kernel_parts_lhs(
         generated, generated_all, ng, row_index_name, col_index_name);
-    out_parts.body += " = " + derived().var_name_ + ";\n";
+    out_parts.body += assignment_op<T_result>() + derived().var_name_ + ";\n";
     parts += out_parts;
     return parts;
   }
diff --git a/stan/math/opencl/prim/normal_lccdf.hpp b/stan/math/opencl/prim/normal_lccdf.hpp
index e9e7f97c079..24a49a70f6f 100644
--- a/stan/math/opencl/prim/normal_lccdf.hpp
+++ b/stan/math/opencl/prim/normal_lccdf.hpp
@@ -82,13 +82,12 @@ return_type_t<T_y_cl, T_loc_cl, T_scale_cl> normal_lccdf(
   matrix_cl<double> mu_deriv_cl;
   matrix_cl<double> sigma_deriv_cl;
 
-  results(check_y_not_nan, check_mu_finite, check_sigma_positive, lccdf_cl,
-          y_deriv_cl, mu_deriv_cl, sigma_deriv_cl)
-      = expressions(y_not_nan_expr, mu_finite_expr, sigma_positive_expr,
-                    lccdf_expr, calc_if<!is_constant<T_y_cl>::value>(y_deriv),
+  results(check_y_not_nan, check_mu_finite, check_sigma_positive)
+      = expressions(y_not_nan_expr, mu_finite_expr, sigma_positive_expr);
+  results(lccdf_cl, y_deriv_cl, mu_deriv_cl, sigma_deriv_cl)
+      = expressions(lccdf_expr, calc_if<!is_constant<T_y_cl>::value>(y_deriv),
                     calc_if<!is_constant<T_loc_cl>::value>(mu_deriv),
                     calc_if<!is_constant<T_scale_cl>::value>(sigma_deriv));
-
   T_partials_return lccdf = LOG_HALF + sum(from_matrix_cl(lccdf_cl));
 
   auto ops_partials = make_partials_propagator(y_col, mu_col, sigma_col);
diff --git a/stan/math/opencl/rev.hpp b/stan/math/opencl/rev.hpp
index 7303a5d34e3..f48de0b806f 100644
--- a/stan/math/opencl/rev.hpp
+++ b/stan/math/opencl/rev.hpp
@@ -50,6 +50,7 @@
 #include <stan/math/opencl/rev/fmax.hpp>
 #include <stan/math/opencl/rev/fmin.hpp>
 #include <stan/math/opencl/rev/fmod.hpp>
+#include <stan/math/opencl/rev/grad.hpp>
 #include <stan/math/opencl/rev/hypot.hpp>
 #include <stan/math/opencl/rev/inv.hpp>
 #include <stan/math/opencl/rev/inv_cloglog.hpp>
diff --git a/stan/math/opencl/rev/adjoint_results.hpp b/stan/math/opencl/rev/adjoint_results.hpp
index c42297b418d..1a3cd238ec1 100644
--- a/stan/math/opencl/rev/adjoint_results.hpp
+++ b/stan/math/opencl/rev/adjoint_results.hpp
@@ -41,22 +41,21 @@ class adjoint_results_cl : protected results_cl<T_results...> {
     index_apply<sizeof...(T_expressions)>([&](auto... Is) {
       auto scalars = std::tuple_cat(select_scalar_assignments(
           std::get<Is>(this->results_), std::get<Is>(exprs.expressions_))...);
-      auto nonscalars_tmp = std::tuple_cat(select_nonscalar_assignments(
+      auto nonscalars_tmp = std::tuple_cat(select_nonscalar_plusequals(
           std::get<Is>(this->results_), std::get<Is>(exprs.expressions_))...);
 
       index_apply<std::tuple_size<decltype(nonscalars_tmp)>::value>(
           [&](auto... Is_nonscal) {
             auto nonscalars = std::make_tuple(std::make_pair(
-                std::get<Is_nonscal>(nonscalars_tmp).first,
-                std::get<Is_nonscal>(nonscalars_tmp).first
-                    + std::get<Is_nonscal>(nonscalars_tmp).second)...);
-
+                std::get<Is_nonscal>(nonscalars_tmp).first, std::get<Is_nonscal>(nonscalars_tmp).second)...);
+ 
+ 
             index_apply<std::tuple_size<decltype(scalars)>::value>(
                 [&](auto... Is_scal) {
                   // evaluate all expressions
                   this->assignment_impl(std::tuple_cat(
                       nonscalars,
-                      this->make_assignment_pair(
+                      this->template make_assignment_pair<assignment_ops_cl::plus_equals>(
                           std::get<2>(std::get<Is_scal>(scalars)),
                           sum_2d(std::get<1>(std::get<Is_scal>(scalars))))...));
 
@@ -100,6 +99,7 @@ class adjoint_results_cl : protected results_cl<T_results...> {
     return std::make_tuple();
   }
 
+
   /**
    * Selects assignments that have non-scalar var results.
    * @tparam T_result type of result. This overload is used for non-scalar vars.
@@ -112,9 +112,9 @@ class adjoint_results_cl : protected results_cl<T_results...> {
   template <typename T_result, typename T_expression,
             require_not_stan_scalar_t<T_result>* = nullptr,
             require_st_var<T_result>* = nullptr>
-  auto select_nonscalar_assignments(const T_result& result,
+  auto select_nonscalar_plusequals(T_result&& result,
                                     T_expression&& expression) {
-    return results_cl<T_results...>::make_assignment_pair(
+    return results_cl<T_results...>::template make_assignment_pair<assignment_ops_cl::plus_equals>(
         result.adj(), std::forward<T_expression>(expression));
   }
   /**
@@ -130,7 +130,7 @@ class adjoint_results_cl : protected results_cl<T_results...> {
       typename T_result, typename T_expression,
       std::enable_if_t<is_stan_scalar<T_result>::value
                        || !is_var<scalar_type_t<T_result>>::value>* = nullptr>
-  auto select_nonscalar_assignments(T_result&& result,
+  auto select_nonscalar_plusequals(T_result&& result,
                                     T_expression&& expression) {
     return std::make_tuple();
   }
diff --git a/stan/math/opencl/rev/grad.hpp b/stan/math/opencl/rev/grad.hpp
new file mode 100644
index 00000000000..46b0e7cb0d7
--- /dev/null
+++ b/stan/math/opencl/rev/grad.hpp
@@ -0,0 +1,31 @@
+#ifndef STAN_MATH_OPENCL_REV_FUN_GRAD_HPP
+#define STAN_MATH_OPENCL_REV_FUN_GRAD_HPP
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/rev.hpp>
+
+namespace stan {
+namespace math {
+
+/**
+ * Propagate chain rule to calculate gradients starting from
+ * the specified variable.  Resizes the input vector to be the
+ * correct size.
+ *
+ * The grad() function does not itself recover any memory.  use
+ * <code>recover_memory()</code> or
+ * <code>recover_memory_nested()</code> to recover memory.
+ *
+ * @param[in] v Value of function being differentiated
+ * @param[in] x Variables being differentiated with respect to
+ * @param[out] g Gradient, d/dx v, evaluated at x.
+ */
+inline void grad(var& v, var_value<matrix_cl<double>>& x,
+                 Eigen::VectorXd& g) {
+  grad(v.vi_);
+  g = from_matrix_cl<Eigen::VectorXd>(x.adj());
+}
+
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp b/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
new file mode 100644
index 00000000000..d9db2f2a7d2
--- /dev/null
+++ b/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
@@ -0,0 +1,27 @@
+#ifdef STAN_OPENCL
+#include <stan/math/prim/fun/Eigen.hpp>
+#include <stan/math/opencl/kernel_generator.hpp>
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <stan/math.hpp>
+#include <test/unit/util.hpp>
+#include <gtest/gtest.h>
+#include <string>
+
+TEST(KernelGenerator, assign_ops) {
+  using stan::math::matrix_cl;
+  using stan::math::var_value;
+  using stan::math::var;
+  using stan::math::to_matrix_cl;
+  Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
+  C += A + B;
+  matrix_cl<double> A_cl = to_matrix_cl(A);
+  matrix_cl<double> B_cl = to_matrix_cl(B);
+  matrix_cl<double> C_cl = to_matrix_cl(C);
+  results(C_cl) += expressions(A_cl + B_cl);
+}
+
+
+#endif
diff --git a/test/unit/math/opencl/rev/add_test.cpp b/test/unit/math/opencl/rev/add_test.cpp
index 15c4d501b2f..e07c438f828 100644
--- a/test/unit/math/opencl/rev/add_test.cpp
+++ b/test/unit/math/opencl/rev/add_test.cpp
@@ -5,75 +5,31 @@
 #include <gtest/gtest.h>
 #include <algorithm>
 
-auto add_functor
-    = [](const auto& a, const auto& b) { return stan::math::add(a, b).eval(); };
 
-TEST(OpenCLPrim, add_v_small_zero) {
-  stan::math::vector_d d1(3), d2(3);
-  d1 << 1, 2, 3;
-  d2 << 3, 2, 1;
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d2);
 
-  stan::math::vector_d d0(0);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d0);
-
-  double d3 = 3.0;
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d3);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d1);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d0);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d3);
-}
-
-TEST(OpenCLPrim, add_rv_small_zero) {
-  stan::math::row_vector_d d1(3), d2(3);
-  d1 << 1, 2, 3;
-  d2 << 3, 2, 1;
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d2);
-
-  stan::math::vector_d d0(0);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d0);
-
-  double d3 = 3.0;
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d3);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d1);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d0);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d3);
-}
-
-TEST(OpenCLPrim, add_m_small_zero) {
-  stan::math::matrix_d d1(3, 3), d2(3, 3);
+TEST(OpenCLPrim, add_aliasing) {
+  stan::math::matrix_d d1(3, 3);
   d1 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
-  d2 << 10, 100, 1000, 0, -10, -12, 2, 4, 8;
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d2);
-
-  stan::math::matrix_d d0(0, 0);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d0);
-
-  double d3 = 3.0;
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d3);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d1);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d0);
-  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d3);
-}
-
-TEST(OpenCLPrim, add_rev_exceptions) {
   using stan::math::matrix_cl;
-  stan::math::vector_d vd1(2), vd2(3);
-  stan::math::var_value<matrix_cl<double>> vd11 = stan::math::to_matrix_cl(vd1);
-  stan::math::var_value<matrix_cl<double>> vd22 = stan::math::to_matrix_cl(vd2);
-  EXPECT_THROW(stan::math::add(vd11, vd22), std::invalid_argument);
-
-  stan::math::row_vector_d rvd1(2), rvd2(3);
-  stan::math::var_value<matrix_cl<double>> rvd11
-      = stan::math::to_matrix_cl(rvd1);
-  stan::math::var_value<matrix_cl<double>> rvd22
-      = stan::math::to_matrix_cl(rvd2);
-  EXPECT_THROW(stan::math::add(rvd11, rvd22), std::invalid_argument);
-
-  stan::math::matrix_d md1(2, 2), md2(3, 3);
-  stan::math::var_value<matrix_cl<double>> md11 = stan::math::to_matrix_cl(md1);
-  stan::math::var_value<matrix_cl<double>> md22 = stan::math::to_matrix_cl(md2);
-  EXPECT_THROW(stan::math::add(md11, md22), std::invalid_argument);
+  using stan::math::var_value;
+  using stan::math::var;
+  using varmat_cl = var_value<matrix_cl<double>>;
+  varmat_cl d11 = stan::math::to_matrix_cl(d1);
+  // Add the same matrix as the left and right hand side
+  var res = stan::math::sum(stan::math::add(d11, d11));
+  res.grad();
+  // Get back adjoints
+  Eigen::MatrixXd grad_res = stan::math::from_matrix_cl(d11.adj());
+  stan::math::recover_memory();
+  Eigen::Matrix<var, -1, -1> d_host = d1;
+  // Same op as above but on the host
+  var res_host = stan::math::sum(stan::math::add(d_host, d_host));
+  res_host.grad();
+  Eigen::MatrixXd grad_res_host = d_host.adj();
+  std::cout << "OpenCL Adjoints: " << std::endl;
+  std::cout << grad_res << std::endl;
+  std::cout << "CPU Adjoints: " << std::endl;
+  std::cout << grad_res_host << std::endl;
 }
 
 #endif
diff --git a/test/unit/math/opencl/rev/grad_test.cpp b/test/unit/math/opencl/rev/grad_test.cpp
new file mode 100644
index 00000000000..e24a70d26b9
--- /dev/null
+++ b/test/unit/math/opencl/rev/grad_test.cpp
@@ -0,0 +1,37 @@
+#ifdef STAN_OPENCL
+#include <stan/math.hpp>
+#include <test/unit/math/opencl/util.hpp>
+#include <test/unit/util.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+
+TEST(OpenCLGradTest, exceptions) {
+  using stan::math::var_value;
+  using stan::math::var;
+  using stan::math::matrix_cl;
+  using stan::math::to_matrix_cl;
+  using varmat_cl = var_value<matrix_cl<double>>;
+  Eigen::VectorXd a(6);
+  a << 1, 2, 3 , 4, 5, 6;
+  varmat_cl a_cl(to_matrix_cl(a));
+  varmat_cl b_cl(a_cl.block(0, 0, 3, 1));
+  varmat_cl c_cl(a_cl.block(3, 0, 3, 1));
+  Eigen::VectorXd ret_grads_cl(6);
+  using stan::math::add;
+  using stan::math::subtract;
+  var ret = stan::math::sum(add(b_cl, b_cl));
+  stan::math::grad(ret, a_cl, ret_grads_cl);
+  std::cout << "opencl grads: \n" << ret_grads_cl << std::endl;
+  stan::math::recover_memory();
+  Eigen::Matrix<var, -1, 1> a_host = a;
+  Eigen::Matrix<var, -1, 1> b_host = a_host.segment(0, 3);
+  Eigen::Matrix<var, -1, 1> c_host = a_host.segment(3, 3);  
+  var ret_host = stan::math::sum(add(b_host, b_host));
+  Eigen::VectorXd ret_grads_host(6);
+  stan::math::grad(ret_host, a_host, ret_grads_host);
+  std::cout << "host grads: \n" << ret_grads_host << std::endl;
+  stan::math::recover_memory();
+}
+
+
+#endif

From bfcd2223b6e2a1993b2f10ce5e2a54e3578845bb Mon Sep 17 00:00:00 2001
From: Steve Bronder <sbronder@flatironinstitute.org>
Date: Wed, 13 Sep 2023 12:27:22 -0400
Subject: [PATCH 2/9] adds tests for the compound assignments

---
 .../kernel_generator/as_operation_cl.hpp      | 10 +--
 .../kernel_generator/assignment_ops.hpp       | 38 +++++------
 stan/math/opencl/kernel_generator/load.hpp    |  4 +-
 .../kernel_generator/multi_result_kernel.hpp  | 66 ++++++++++++++-----
 stan/math/opencl/rev/adjoint_results.hpp      | 14 ++--
 .../kernel_generator/assignment_ops_test.cpp  | 63 +++++++++++++++++-
 6 files changed, 142 insertions(+), 53 deletions(-)

diff --git a/stan/math/opencl/kernel_generator/as_operation_cl.hpp b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
index a7cf2ac7264..8ea743ccdd7 100644
--- a/stan/math/opencl/kernel_generator/as_operation_cl.hpp
+++ b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
@@ -24,7 +24,7 @@ namespace math {
  * @param a an operation
  * @return operation
  */
-template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_operation,
+template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_operation,
           typename = std::enable_if_t<std::is_base_of<
               operation_cl_base, std::remove_reference_t<T_operation>>::value>>
 inline T_operation&& as_operation_cl(T_operation&& a) {
@@ -38,7 +38,7 @@ inline T_operation&& as_operation_cl(T_operation&& a) {
  * @param a scalar
  * @return \c scalar_ wrapping the input
  */
-template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_scalar, typename = require_arithmetic_t<T_scalar>,
+template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_scalar, typename = require_arithmetic_t<T_scalar>,
           require_not_same_t<T_scalar, bool>* = nullptr>
 inline scalar_<T_scalar> as_operation_cl(const T_scalar a) {
   return scalar_<T_scalar>(a);
@@ -51,7 +51,7 @@ inline scalar_<T_scalar> as_operation_cl(const T_scalar a) {
  * @param a scalar
  * @return \c scalar_<char> wrapping the input
  */
-template <assignment_ops_cl AssignOp = assignment_ops_cl::equals>
+template <assign_op_cl AssignOp = assign_op_cl::equals>
 inline scalar_<char> as_operation_cl(const bool a) { return scalar_<char>(a); }
 
 /**
@@ -61,7 +61,7 @@ inline scalar_<char> as_operation_cl(const bool a) { return scalar_<char>(a); }
  * @param a \c matrix_cl
  * @return \c load_ wrapping the input
  */
-template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_matrix_cl,
+template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_matrix_cl,
           typename = require_any_t<is_matrix_cl<T_matrix_cl>,
                                    is_arena_matrix_cl<T_matrix_cl>>>
 inline load_<T_matrix_cl, AssignOp> as_operation_cl(T_matrix_cl&& a) {
@@ -76,7 +76,7 @@ inline load_<T_matrix_cl, AssignOp> as_operation_cl(T_matrix_cl&& a) {
  * rvalue reference, the reference is removed, so that a variable of this type
  * actually stores the value.
  */
-template <typename T, assignment_ops_cl AssignOp = assignment_ops_cl::equals>
+template <typename T, assign_op_cl AssignOp = assign_op_cl::equals>
 using as_operation_cl_t = std::conditional_t<
     std::is_lvalue_reference<T>::value,
     decltype(as_operation_cl<AssignOp>(std::declval<T>())),
diff --git a/stan/math/opencl/kernel_generator/assignment_ops.hpp b/stan/math/opencl/kernel_generator/assignment_ops.hpp
index 5e3919ced95..cca9b8e7a6f 100644
--- a/stan/math/opencl/kernel_generator/assignment_ops.hpp
+++ b/stan/math/opencl/kernel_generator/assignment_ops.hpp
@@ -9,60 +9,56 @@ namespace math {
 /**
  * Ops that decide the type of assignment for LHS operations
  */
-enum class assignment_ops_cl {equals, plus_equals, minus_equals, divide_equals};
+enum class assign_op_cl {equals, plus_equals, minus_equals, divide_equals, multiply_equals};
 
+namespace internal {
 /**
  * @param value A static constexpr const char* member for printing assignment ops
  */
-template <assignment_ops_cl assign_op>
-struct assignment_op_str;
+template <assign_op_cl assign_op>
+struct assignment_op_str_impl;
 
 template <>
-struct assignment_op_str<assignment_ops_cl::equals> {
+struct assignment_op_str_impl<assign_op_cl::equals> {
   static constexpr const char* value = " = ";
 };
 
 template <>
-struct assignment_op_str<assignment_ops_cl::plus_equals> {
+struct assignment_op_str_impl<assign_op_cl::plus_equals> {
   static constexpr const char* value = " += ";
 };
 
 template <>
-struct assignment_op_str<assignment_ops_cl::minus_equals> {
-  static constexpr const char* value = " *= ";
+struct assignment_op_str_impl<assign_op_cl::minus_equals> {
+  static constexpr const char* value = " -= ";
 };
 
 template <>
-struct assignment_op_str<assignment_ops_cl::divide_equals> {
+struct assignment_op_str_impl<assign_op_cl::divide_equals> {
   static constexpr const char* value = " /= ";
 };
 
+template <>
+struct assignment_op_str_impl<assign_op_cl::multiply_equals> {
+  static constexpr const char* value = " *= ";
+};
 
-namespace internal {
 template <typename, typename = void>
-struct has_assignment_op_str : std::false_type {};
+struct assignment_op_str : assignment_op_str_impl<assign_op_cl::equals> {};
 
 template <typename T>
-struct has_assignment_op_str<T, void_t<decltype(T::assignment_op)>> : std::true_type {};
+struct assignment_op_str<T, void_t<decltype(T::assignment_op)>> : assignment_op_str_impl<T::assignment_op> {};
 
 }  // namespace internal
 
-/**
- * @tparam T A type that does not have an `assignment_op` static constexpr member type 
- * @return A constexpr const char* equal to `" = "`
- */
-template <typename T, std::enable_if_t<!internal::has_assignment_op_str<std::decay_t<T>>::value>* = nullptr>
-inline constexpr const char* assignment_op() noexcept {
-  return " = ";
-}
 
 /**
  * @tparam T A type that has an `assignment_op` static constexpr member type
  * @return The types assignment op as a constexpr const char*
  */
-template <typename T, std::enable_if_t<internal::has_assignment_op_str<T>::value>* = nullptr>
+template <typename T>
 inline constexpr const char* assignment_op() noexcept {
-  return assignment_op_str<std::decay_t<T>::assignment_op>::value;
+  return internal::assignment_op_str<std::decay_t<T>>::value;
 }
 
 }
diff --git a/stan/math/opencl/kernel_generator/load.hpp b/stan/math/opencl/kernel_generator/load.hpp
index bd3eaafb1c3..018ecce6e50 100644
--- a/stan/math/opencl/kernel_generator/load.hpp
+++ b/stan/math/opencl/kernel_generator/load.hpp
@@ -27,7 +27,7 @@ namespace math {
  * @tparam T \c matrix_cl
  * @tparam AssignOp tells higher level operations whether the final operation should be an assignment or a type of compound assignment.
  */
-template <typename T, assignment_ops_cl AssignOp = assignment_ops_cl::equals>
+template <typename T, assign_op_cl AssignOp = assign_op_cl::equals>
 class load_
     : public operation_cl_lhs<load_<T, AssignOp>,
                               typename std::remove_reference_t<T>::type> {
@@ -36,7 +36,7 @@ class load_
 
  public:
 
-  static constexpr assignment_ops_cl assignment_op = AssignOp;
+  static constexpr assign_op_cl assignment_op = AssignOp;
   using Scalar = typename std::remove_reference_t<T>::type;
   using base = operation_cl<load_<T, AssignOp>, Scalar>;
   using base::var_name_;
diff --git a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
index 3b9362ced06..23e8771f73a 100644
--- a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
+++ b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
@@ -323,19 +323,19 @@ class results_cl {
     });
   }
 
-  /**
+    /**
    * Incrementing \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
    * @tparam T_expressions types of expressions
    * @param exprs expressions
    */
-  template <typename... T_expressions,
+  template <assign_op_cl AssignOp = assign_op_cl::plus_equals, typename... T_expressions,
             typename = std::enable_if_t<sizeof...(T_results)
                                         == sizeof...(T_expressions)>>
-  void operator+=(const expressions_cl<T_expressions...>& exprs) {
+  void compound_assignment_impl(const expressions_cl<T_expressions...>& exprs) {
     index_apply<sizeof...(T_expressions)>([this, &exprs](auto... Is) {
-      auto tmp = std::tuple_cat(make_assignment_pair<assignment_ops_cl::plus_equals>(
+      auto tmp = std::tuple_cat(make_assignment_pair<AssignOp>(
           std::get<Is>(results_), std::get<Is>(exprs.expressions_))...);
       index_apply<std::tuple_size<decltype(tmp)>::value>(
           [this, &tmp](auto... Is2) {
@@ -345,6 +345,20 @@ class results_cl {
     });
   }
 
+  /**
+   * Incrementing \c results_ object by \c expressions_cl object
+   * executes the kernel that evaluates expressions and increments results by
+   * those expressions.
+   * @tparam T_expressions types of expressions
+   * @param exprs expressions
+   */
+  template <typename... T_expressions,
+            typename = std::enable_if_t<sizeof...(T_results)
+                                        == sizeof...(T_expressions)>>
+  void operator+=(const expressions_cl<T_expressions...>& exprs) {
+    compound_assignment_impl<assign_op_cl::plus_equals>(exprs);
+  }
+
     /**
    * Incrementing \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
@@ -356,15 +370,35 @@ class results_cl {
             typename = std::enable_if_t<sizeof...(T_results)
                                         == sizeof...(T_expressions)>>
   void operator-=(const expressions_cl<T_expressions...>& exprs) {
-    index_apply<sizeof...(T_expressions)>([this, &exprs](auto... Is) {
-      auto tmp = std::tuple_cat(make_assignment_pair<assignment_ops_cl::minus_equals>(
-          std::get<Is>(results_), std::get<Is>(exprs.expressions_))...);
-      index_apply<std::tuple_size<decltype(tmp)>::value>(
-          [this, &tmp](auto... Is2) {
-            assignment_impl(std::make_tuple(std::make_pair(
-                std::get<Is2>(tmp).first, std::get<Is2>(tmp).second)...));
-          });
-    });
+    compound_assignment_impl<assign_op_cl::minus_equals>(exprs);
+  }
+
+    /**
+   * Incrementing \c results_ object by \c expressions_cl object
+   * executes the kernel that evaluates expressions and increments results by
+   * those expressions.
+   * @tparam T_expressions types of expressions
+   * @param exprs expressions
+   */
+  template <typename... T_expressions,
+            typename = std::enable_if_t<sizeof...(T_results)
+                                        == sizeof...(T_expressions)>>
+  void operator/=(const expressions_cl<T_expressions...>& exprs) {
+    compound_assignment_impl<assign_op_cl::divide_equals>(exprs);
+  }
+
+    /**
+   * Incrementing \c results_ object by \c expressions_cl object
+   * executes the kernel that evaluates expressions and increments results by
+   * those expressions.
+   * @tparam T_expressions types of expressions
+   * @param exprs expressions
+   */
+  template <typename... T_expressions,
+            typename = std::enable_if_t<sizeof...(T_results)
+                                        == sizeof...(T_expressions)>>
+  void operator*=(const expressions_cl<T_expressions...>& exprs) {
+    compound_assignment_impl<assign_op_cl::multiply_equals>(exprs);
   }
 
   /**
@@ -551,7 +585,7 @@ class results_cl {
    * @param expression expression
    * @return a tuple of pair of result and expression
    */
-  template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_result, typename T_expression,
+  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_result, typename T_expression,
             require_all_not_t<is_without_output<T_expression>,
                               conjunction<internal::is_scalar_check<T_result>,
                                           std::is_arithmetic<std::decay_t<
@@ -570,7 +604,7 @@ class results_cl {
    * @param expression expression
    * @return a tuple of pair of result and expression
    */
-  template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_result, typename T_expression,
+  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_result, typename T_expression,
             require_t<is_without_output<T_expression>>* = nullptr>
   static auto make_assignment_pair(T_result&& result,
                                    T_expression&& expression) {
@@ -584,7 +618,7 @@ class results_cl {
    * @param pass bool scalar
    * @return an empty tuple
    */
-  template <assignment_ops_cl AssignOp = assignment_ops_cl::equals, typename T_check, typename T_pass,
+  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_check, typename T_pass,
             require_t<internal::is_scalar_check<T_check>>* = nullptr,
             require_integral_t<T_pass>* = nullptr>
   static std::tuple<> make_assignment_pair(T_check&& result, T_pass&& pass) {
diff --git a/stan/math/opencl/rev/adjoint_results.hpp b/stan/math/opencl/rev/adjoint_results.hpp
index 1a3cd238ec1..b9a9b1a1c9b 100644
--- a/stan/math/opencl/rev/adjoint_results.hpp
+++ b/stan/math/opencl/rev/adjoint_results.hpp
@@ -41,7 +41,7 @@ class adjoint_results_cl : protected results_cl<T_results...> {
     index_apply<sizeof...(T_expressions)>([&](auto... Is) {
       auto scalars = std::tuple_cat(select_scalar_assignments(
           std::get<Is>(this->results_), std::get<Is>(exprs.expressions_))...);
-      auto nonscalars_tmp = std::tuple_cat(select_nonscalar_plusequals(
+      auto nonscalars_tmp = std::tuple_cat(select_nonscalar_assignments<assign_op_cl::plus_equals>(
           std::get<Is>(this->results_), std::get<Is>(exprs.expressions_))...);
 
       index_apply<std::tuple_size<decltype(nonscalars_tmp)>::value>(
@@ -55,7 +55,7 @@ class adjoint_results_cl : protected results_cl<T_results...> {
                   // evaluate all expressions
                   this->assignment_impl(std::tuple_cat(
                       nonscalars,
-                      this->template make_assignment_pair<assignment_ops_cl::plus_equals>(
+                      this->template make_assignment_pair<assign_op_cl::plus_equals>(
                           std::get<2>(std::get<Is_scal>(scalars)),
                           sum_2d(std::get<1>(std::get<Is_scal>(scalars))))...));
 
@@ -109,12 +109,12 @@ class adjoint_results_cl : protected results_cl<T_results...> {
    * @return pair of result and expression or empty tuple (if the result is
    * check or the expression is `calc_if<false,T>`.
    */
-  template <typename T_result, typename T_expression,
+  template <assign_op_cl AssignOp, typename T_result, typename T_expression,
             require_not_stan_scalar_t<T_result>* = nullptr,
             require_st_var<T_result>* = nullptr>
-  auto select_nonscalar_plusequals(T_result&& result,
+  auto select_nonscalar_assignments(T_result&& result,
                                     T_expression&& expression) {
-    return results_cl<T_results...>::template make_assignment_pair<assignment_ops_cl::plus_equals>(
+    return results_cl<T_results...>::template make_assignment_pair<AssignOp>(
         result.adj(), std::forward<T_expression>(expression));
   }
   /**
@@ -126,11 +126,11 @@ class adjoint_results_cl : protected results_cl<T_results...> {
    * @param expression expression
    * @return empty tuple
    */
-  template <
+  template <assign_op_cl AssignOp,
       typename T_result, typename T_expression,
       std::enable_if_t<is_stan_scalar<T_result>::value
                        || !is_var<scalar_type_t<T_result>>::value>* = nullptr>
-  auto select_nonscalar_plusequals(T_result&& result,
+  auto select_nonscalar_assignments(T_result&& result,
                                     T_expression&& expression) {
     return std::make_tuple();
   }
diff --git a/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp b/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
index d9db2f2a7d2..361e72a8630 100644
--- a/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
+++ b/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
@@ -8,20 +8,79 @@
 #include <gtest/gtest.h>
 #include <string>
 
-TEST(KernelGenerator, assign_ops) {
+TEST(KernelGenerator, plus_equals) {
   using stan::math::matrix_cl;
   using stan::math::var_value;
   using stan::math::var;
   using stan::math::to_matrix_cl;
+  using stan::math::from_matrix_cl;
   Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
-  C += A + B;
   matrix_cl<double> A_cl = to_matrix_cl(A);
   matrix_cl<double> B_cl = to_matrix_cl(B);
   matrix_cl<double> C_cl = to_matrix_cl(C);
+  C += A + B;
   results(C_cl) += expressions(A_cl + B_cl);
+  Eigen::MatrixXd C_cl_host = from_matrix_cl(C_cl);
+  EXPECT_MATRIX_EQ(C_cl_host, C)
+}
+
+TEST(KernelGenerator, minus_equals) {
+  using stan::math::matrix_cl;
+  using stan::math::var_value;
+  using stan::math::var;
+  using stan::math::to_matrix_cl;
+  using stan::math::from_matrix_cl;
+  Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
+  matrix_cl<double> A_cl = to_matrix_cl(A);
+  matrix_cl<double> B_cl = to_matrix_cl(B);
+  matrix_cl<double> C_cl = to_matrix_cl(C);
+  C -= A + B;
+  results(C_cl) -= expressions(A_cl + B_cl);
+  Eigen::MatrixXd C_cl_host = from_matrix_cl(C_cl);
+  EXPECT_MATRIX_EQ(C_cl_host, C)
+}
+
+TEST(KernelGenerator, divide_equals) {
+  using stan::math::matrix_cl;
+  using stan::math::var_value;
+  using stan::math::var;
+  using stan::math::to_matrix_cl;
+  using stan::math::from_matrix_cl;
+  Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
+  matrix_cl<double> A_cl = to_matrix_cl(A);
+  matrix_cl<double> B_cl = to_matrix_cl(B);
+  matrix_cl<double> C_cl = to_matrix_cl(C);
+  C.array() /= A.array() + B.array();
+  results(C_cl) /= expressions(A_cl + B_cl);
+  Eigen::MatrixXd C_cl_host = from_matrix_cl(C_cl);
+  EXPECT_MATRIX_EQ(C_cl_host, C)
+}
+
+TEST(KernelGenerator, times_equals) {
+  using stan::math::matrix_cl;
+  using stan::math::var_value;
+  using stan::math::var;
+  using stan::math::to_matrix_cl;
+  using stan::math::from_matrix_cl;
+
+  Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
+  Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
+  matrix_cl<double> A_cl = to_matrix_cl(A);
+  matrix_cl<double> B_cl = to_matrix_cl(B);
+  matrix_cl<double> C_cl = to_matrix_cl(C);
+  C.array() *= A.array() + B.array();
+  results(C_cl) *= expressions(A_cl + B_cl);
+  Eigen::MatrixXd C_cl_host = from_matrix_cl(C_cl);
+  EXPECT_MATRIX_EQ(C_cl_host, C)
 }
 
 
+
 #endif

From 02554505b0e6e5fa665a67ce564e7b99baac7db2 Mon Sep 17 00:00:00 2001
From: Steve Bronder <sbronder@flatironinstitute.org>
Date: Wed, 13 Sep 2023 12:50:03 -0400
Subject: [PATCH 3/9] update docs

---
 .../kernel_generator/as_operation_cl.hpp      |  8 ++++++++
 .../kernel_generator/multi_result_kernel.hpp  | 20 ++++++++++++++++---
 stan/math/opencl/rev/adjoint_results.hpp      |  4 ++++
 3 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/stan/math/opencl/kernel_generator/as_operation_cl.hpp b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
index 8ea743ccdd7..c6e0d6732a9 100644
--- a/stan/math/opencl/kernel_generator/as_operation_cl.hpp
+++ b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
@@ -20,6 +20,7 @@ namespace math {
 /**
  * Converts any valid kernel generator expression into an operation. This is an
  * overload for operations - a no-op
+ * @tparam AssignOp ignored
  * @tparam T_operation type of the input operation
  * @param a an operation
  * @return operation
@@ -34,6 +35,7 @@ inline T_operation&& as_operation_cl(T_operation&& a) {
 /**
  * Converts any valid kernel generator expression into an operation. This is an
  * overload for scalars (arithmetic types). It wraps them into \c scalar_.
+ * @tparam AssignOp ignored
  * @tparam T_scalar type of the input scalar
  * @param a scalar
  * @return \c scalar_ wrapping the input
@@ -48,6 +50,7 @@ inline scalar_<T_scalar> as_operation_cl(const T_scalar a) {
  * Converts any valid kernel generator expression into an operation. This is an
  * overload for bool scalars. It wraps them into \c scalar_<char> as \c bool can
  * not be used as a type of a kernel argument.
+ * @tparam AssignOp ignored
  * @param a scalar
  * @return \c scalar_<char> wrapping the input
  */
@@ -57,6 +60,8 @@ inline scalar_<char> as_operation_cl(const bool a) { return scalar_<char>(a); }
 /**
  * Converts any valid kernel generator expression into an operation. This is an
  * overload for \c matrix_cl. It wraps them into into \c load_.
+ * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+ *  is assigned using standard or compound assign. 
  * @tparam T_matrix_cl \c matrix_cl
  * @param a \c matrix_cl
  * @return \c load_ wrapping the input
@@ -75,6 +80,9 @@ inline load_<T_matrix_cl, AssignOp> as_operation_cl(T_matrix_cl&& a) {
  * as_operation_cl_t<T>. If the return value of \c as_operation_cl() would be a
  * rvalue reference, the reference is removed, so that a variable of this type
  * actually stores the value.
+ * @tparam T a `matrix_cl` or `Scalar` type 
+ * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+ *  is assigned using standard or compound assign. 
  */
 template <typename T, assign_op_cl AssignOp = assign_op_cl::equals>
 using as_operation_cl_t = std::conditional_t<
diff --git a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
index 23e8771f73a..8358605aa89 100644
--- a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
+++ b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
@@ -327,6 +327,8 @@ class results_cl {
    * Incrementing \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+   *  is assigned using standard or compound assign. 
    * @tparam T_expressions types of expressions
    * @param exprs expressions
    */
@@ -360,7 +362,7 @@ class results_cl {
   }
 
     /**
-   * Incrementing \c results_ object by \c expressions_cl object
+   * Decrement \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
    * @tparam T_expressions types of expressions
@@ -374,7 +376,7 @@ class results_cl {
   }
 
     /**
-   * Incrementing \c results_ object by \c expressions_cl object
+   * Elementwise divide \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
    * @tparam T_expressions types of expressions
@@ -388,7 +390,7 @@ class results_cl {
   }
 
     /**
-   * Incrementing \c results_ object by \c expressions_cl object
+   * Elementwise multiply \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
    * @tparam T_expressions types of expressions
@@ -581,6 +583,10 @@ class results_cl {
   /**
    * Makes a std::pair of one result and one expression and wraps it into a
    * tuple.
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+   *  is assigned using standard or compound assign. 
+   * @tparam T_result An non scalar type that is normally an `result_cl` operation holding a `matrix_cl` 
+   * @tparam T_expression An expression of set of operations on `matrix_cl` and scalar types.
    * @param result result
    * @param expression expression
    * @return a tuple of pair of result and expression
@@ -600,6 +606,10 @@ class results_cl {
 
   /**
    * If an expression does not need to be calculated this returns an empty tuple
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+   *  is assigned using standard or compound assign. 
+   * @tparam T_result An non scalar type that is normally an `result_cl` operation holding a `matrix_cl` 
+   * @tparam T_expression An expression of set of operations on `matrix_cl` and scalar types.
    * @param result result
    * @param expression expression
    * @return a tuple of pair of result and expression
@@ -614,6 +624,10 @@ class results_cl {
   /**
    * Checks on scalars are done separately in this overload instead of in
    * kernel.
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+   *  is assigned using standard or compound assign. 
+   * @tparam T_check A scalar type
+   * @tparam T_pass An integral type
    * @param result result - check
    * @param pass bool scalar
    * @return an empty tuple
diff --git a/stan/math/opencl/rev/adjoint_results.hpp b/stan/math/opencl/rev/adjoint_results.hpp
index b9a9b1a1c9b..4e671fc6837 100644
--- a/stan/math/opencl/rev/adjoint_results.hpp
+++ b/stan/math/opencl/rev/adjoint_results.hpp
@@ -102,6 +102,8 @@ class adjoint_results_cl : protected results_cl<T_results...> {
 
   /**
    * Selects assignments that have non-scalar var results.
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+   *  is assigned using standard or compound assign. 
    * @tparam T_result type of result. This overload is used for non-scalar vars.
    * @tparam T_expression type of expression
    * @param result result
@@ -119,6 +121,8 @@ class adjoint_results_cl : protected results_cl<T_results...> {
   }
   /**
    * Selects assignments that have non-scalar var results.
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
+   *  is assigned using standard or compound assign. 
    * @tparam T_result type of result. This overload is used for results that are
    * either scalars or not vars.
    * @tparam T_expression type of expression

From ab11b32212641bdc08250ec80afaa6ce678f5ce1 Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Wed, 13 Sep 2023 14:16:51 -0400
Subject: [PATCH 4/9] [Jenkins] auto-formatting by clang-format version
 10.0.0-4ubuntu1

---
 .../kernel_generator/as_operation_cl.hpp      | 26 +++++----
 .../kernel_generator/assignment_ops.hpp       | 19 ++++--
 stan/math/opencl/kernel_generator/load.hpp    | 12 ++--
 .../kernel_generator/multi_result_kernel.hpp  | 58 +++++++++++--------
 .../opencl/kernel_generator/operation_cl.hpp  | 30 +++++-----
 stan/math/opencl/rev/adjoint_results.hpp      | 30 +++++-----
 stan/math/opencl/rev/grad.hpp                 |  3 +-
 .../kernel_generator/assignment_ops_test.cpp  | 26 ++++-----
 test/unit/math/opencl/rev/add_test.cpp        |  4 +-
 test/unit/math/opencl/rev/grad_test.cpp       |  9 ++-
 10 files changed, 118 insertions(+), 99 deletions(-)

diff --git a/stan/math/opencl/kernel_generator/as_operation_cl.hpp b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
index c6e0d6732a9..b87e4fb0a9d 100644
--- a/stan/math/opencl/kernel_generator/as_operation_cl.hpp
+++ b/stan/math/opencl/kernel_generator/as_operation_cl.hpp
@@ -40,7 +40,8 @@ inline T_operation&& as_operation_cl(T_operation&& a) {
  * @param a scalar
  * @return \c scalar_ wrapping the input
  */
-template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_scalar, typename = require_arithmetic_t<T_scalar>,
+template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_scalar,
+          typename = require_arithmetic_t<T_scalar>,
           require_not_same_t<T_scalar, bool>* = nullptr>
 inline scalar_<T_scalar> as_operation_cl(const T_scalar a) {
   return scalar_<T_scalar>(a);
@@ -55,13 +56,15 @@ inline scalar_<T_scalar> as_operation_cl(const T_scalar a) {
  * @return \c scalar_<char> wrapping the input
  */
 template <assign_op_cl AssignOp = assign_op_cl::equals>
-inline scalar_<char> as_operation_cl(const bool a) { return scalar_<char>(a); }
+inline scalar_<char> as_operation_cl(const bool a) {
+  return scalar_<char>(a);
+}
 
 /**
  * Converts any valid kernel generator expression into an operation. This is an
  * overload for \c matrix_cl. It wraps them into into \c load_.
- * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
- *  is assigned using standard or compound assign. 
+ * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object
+ *  is assigned using standard or compound assign.
  * @tparam T_matrix_cl \c matrix_cl
  * @param a \c matrix_cl
  * @return \c load_ wrapping the input
@@ -80,15 +83,16 @@ inline load_<T_matrix_cl, AssignOp> as_operation_cl(T_matrix_cl&& a) {
  * as_operation_cl_t<T>. If the return value of \c as_operation_cl() would be a
  * rvalue reference, the reference is removed, so that a variable of this type
  * actually stores the value.
- * @tparam T a `matrix_cl` or `Scalar` type 
- * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
- *  is assigned using standard or compound assign. 
+ * @tparam T a `matrix_cl` or `Scalar` type
+ * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object
+ *  is assigned using standard or compound assign.
  */
 template <typename T, assign_op_cl AssignOp = assign_op_cl::equals>
-using as_operation_cl_t = std::conditional_t<
-    std::is_lvalue_reference<T>::value,
-    decltype(as_operation_cl<AssignOp>(std::declval<T>())),
-    std::remove_reference_t<decltype(as_operation_cl<AssignOp>(std::declval<T>()))>>;
+using as_operation_cl_t
+    = std::conditional_t<std::is_lvalue_reference<T>::value,
+                         decltype(as_operation_cl<AssignOp>(std::declval<T>())),
+                         std::remove_reference_t<decltype(
+                             as_operation_cl<AssignOp>(std::declval<T>()))>>;
 
 /** @}*/
 }  // namespace math
diff --git a/stan/math/opencl/kernel_generator/assignment_ops.hpp b/stan/math/opencl/kernel_generator/assignment_ops.hpp
index cca9b8e7a6f..a365492a8e1 100644
--- a/stan/math/opencl/kernel_generator/assignment_ops.hpp
+++ b/stan/math/opencl/kernel_generator/assignment_ops.hpp
@@ -9,11 +9,18 @@ namespace math {
 /**
  * Ops that decide the type of assignment for LHS operations
  */
-enum class assign_op_cl {equals, plus_equals, minus_equals, divide_equals, multiply_equals};
+enum class assign_op_cl {
+  equals,
+  plus_equals,
+  minus_equals,
+  divide_equals,
+  multiply_equals
+};
 
 namespace internal {
 /**
- * @param value A static constexpr const char* member for printing assignment ops
+ * @param value A static constexpr const char* member for printing assignment
+ * ops
  */
 template <assign_op_cl assign_op>
 struct assignment_op_str_impl;
@@ -47,11 +54,11 @@ template <typename, typename = void>
 struct assignment_op_str : assignment_op_str_impl<assign_op_cl::equals> {};
 
 template <typename T>
-struct assignment_op_str<T, void_t<decltype(T::assignment_op)>> : assignment_op_str_impl<T::assignment_op> {};
+struct assignment_op_str<T, void_t<decltype(T::assignment_op)>>
+    : assignment_op_str_impl<T::assignment_op> {};
 
 }  // namespace internal
 
-
 /**
  * @tparam T A type that has an `assignment_op` static constexpr member type
  * @return The types assignment op as a constexpr const char*
@@ -61,7 +68,7 @@ inline constexpr const char* assignment_op() noexcept {
   return internal::assignment_op_str<std::decay_t<T>>::value;
 }
 
-}
-}
+}  // namespace math
+}  // namespace stan
 #endif
 #endif
diff --git a/stan/math/opencl/kernel_generator/load.hpp b/stan/math/opencl/kernel_generator/load.hpp
index 018ecce6e50..6f48252d89e 100644
--- a/stan/math/opencl/kernel_generator/load.hpp
+++ b/stan/math/opencl/kernel_generator/load.hpp
@@ -25,7 +25,8 @@ namespace math {
 /**
  * Represents an access to a \c matrix_cl in kernel generator expressions
  * @tparam T \c matrix_cl
- * @tparam AssignOp tells higher level operations whether the final operation should be an assignment or a type of compound assignment.
+ * @tparam AssignOp tells higher level operations whether the final operation
+ * should be an assignment or a type of compound assignment.
  */
 template <typename T, assign_op_cl AssignOp = assign_op_cl::equals>
 class load_
@@ -35,7 +36,6 @@ class load_
   T a_;
 
  public:
-
   static constexpr assign_op_cl assignment_op = AssignOp;
   using Scalar = typename std::remove_reference_t<T>::type;
   using base = operation_cl<load_<T, AssignOp>, Scalar>;
@@ -57,8 +57,12 @@ class load_
    * @return copy of \c *this
    */
   inline load_<T&, AssignOp> deep_copy() & { return load_<T&, AssignOp>(a_); }
-  inline load_<const T&, AssignOp> deep_copy() const& { return load_<const T&, AssignOp>(a_); }
-  inline load_<T, AssignOp> deep_copy() && { return load_<T, AssignOp>(std::forward<T>(a_)); }
+  inline load_<const T&, AssignOp> deep_copy() const& {
+    return load_<const T&, AssignOp>(a_);
+  }
+  inline load_<T, AssignOp> deep_copy() && {
+    return load_<T, AssignOp>(std::forward<T>(a_));
+  }
 
   /**
    * Generates kernel code for this expression.
diff --git a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
index 8358605aa89..167cd44ed13 100644
--- a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
+++ b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp
@@ -323,16 +323,17 @@ class results_cl {
     });
   }
 
-    /**
+  /**
    * Incrementing \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
-   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
-   *  is assigned using standard or compound assign. 
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the
+   * object is assigned using standard or compound assign.
    * @tparam T_expressions types of expressions
    * @param exprs expressions
    */
-  template <assign_op_cl AssignOp = assign_op_cl::plus_equals, typename... T_expressions,
+  template <assign_op_cl AssignOp = assign_op_cl::plus_equals,
+            typename... T_expressions,
             typename = std::enable_if_t<sizeof...(T_results)
                                         == sizeof...(T_expressions)>>
   void compound_assignment_impl(const expressions_cl<T_expressions...>& exprs) {
@@ -361,7 +362,7 @@ class results_cl {
     compound_assignment_impl<assign_op_cl::plus_equals>(exprs);
   }
 
-    /**
+  /**
    * Decrement \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
@@ -375,7 +376,7 @@ class results_cl {
     compound_assignment_impl<assign_op_cl::minus_equals>(exprs);
   }
 
-    /**
+  /**
    * Elementwise divide \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
@@ -389,7 +390,7 @@ class results_cl {
     compound_assignment_impl<assign_op_cl::divide_equals>(exprs);
   }
 
-    /**
+  /**
    * Elementwise multiply \c results_ object by \c expressions_cl object
    * executes the kernel that evaluates expressions and increments results by
    * those expressions.
@@ -484,7 +485,7 @@ class results_cl {
           + parts.reduction_2d +
           "}\n";
     }
-     return src;
+    return src;
   }
 
   /**
@@ -583,38 +584,44 @@ class results_cl {
   /**
    * Makes a std::pair of one result and one expression and wraps it into a
    * tuple.
-   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
-   *  is assigned using standard or compound assign. 
-   * @tparam T_result An non scalar type that is normally an `result_cl` operation holding a `matrix_cl` 
-   * @tparam T_expression An expression of set of operations on `matrix_cl` and scalar types.
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the
+   * object is assigned using standard or compound assign.
+   * @tparam T_result An non scalar type that is normally an `result_cl`
+   * operation holding a `matrix_cl`
+   * @tparam T_expression An expression of set of operations on `matrix_cl` and
+   * scalar types.
    * @param result result
    * @param expression expression
    * @return a tuple of pair of result and expression
    */
-  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_result, typename T_expression,
+  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_result,
+            typename T_expression,
             require_all_not_t<is_without_output<T_expression>,
                               conjunction<internal::is_scalar_check<T_result>,
                                           std::is_arithmetic<std::decay_t<
                                               T_expression>>>>* = nullptr>
   static auto make_assignment_pair(T_result&& result,
                                    T_expression&& expression) {
-    return std::make_tuple(
-        std::pair<as_operation_cl_t<T_result, AssignOp>, as_operation_cl_t<T_expression>>(
-            as_operation_cl<AssignOp>(std::forward<T_result>(result)),
-            as_operation_cl(std::forward<T_expression>(expression))));
+    return std::make_tuple(std::pair<as_operation_cl_t<T_result, AssignOp>,
+                                     as_operation_cl_t<T_expression>>(
+        as_operation_cl<AssignOp>(std::forward<T_result>(result)),
+        as_operation_cl(std::forward<T_expression>(expression))));
   }
 
   /**
    * If an expression does not need to be calculated this returns an empty tuple
-   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
-   *  is assigned using standard or compound assign. 
-   * @tparam T_result An non scalar type that is normally an `result_cl` operation holding a `matrix_cl` 
-   * @tparam T_expression An expression of set of operations on `matrix_cl` and scalar types.
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the
+   * object is assigned using standard or compound assign.
+   * @tparam T_result An non scalar type that is normally an `result_cl`
+   * operation holding a `matrix_cl`
+   * @tparam T_expression An expression of set of operations on `matrix_cl` and
+   * scalar types.
    * @param result result
    * @param expression expression
    * @return a tuple of pair of result and expression
    */
-  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_result, typename T_expression,
+  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_result,
+            typename T_expression,
             require_t<is_without_output<T_expression>>* = nullptr>
   static auto make_assignment_pair(T_result&& result,
                                    T_expression&& expression) {
@@ -624,15 +631,16 @@ class results_cl {
   /**
    * Checks on scalars are done separately in this overload instead of in
    * kernel.
-   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
-   *  is assigned using standard or compound assign. 
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the
+   * object is assigned using standard or compound assign.
    * @tparam T_check A scalar type
    * @tparam T_pass An integral type
    * @param result result - check
    * @param pass bool scalar
    * @return an empty tuple
    */
-  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_check, typename T_pass,
+  template <assign_op_cl AssignOp = assign_op_cl::equals, typename T_check,
+            typename T_pass,
             require_t<internal::is_scalar_check<T_check>>* = nullptr,
             require_integral_t<T_pass>* = nullptr>
   static std::tuple<> make_assignment_pair(T_check&& result, T_pass&& pass) {
diff --git a/stan/math/opencl/kernel_generator/operation_cl.hpp b/stan/math/opencl/kernel_generator/operation_cl.hpp
index 84f026dbdf6..f5995484da0 100644
--- a/stan/math/opencl/kernel_generator/operation_cl.hpp
+++ b/stan/math/opencl/kernel_generator/operation_cl.hpp
@@ -76,21 +76,21 @@ struct kernel_parts {
 };
 
 std::ostream& operator<<(std::ostream& os, kernel_parts& parts) {
-      os << "args:" << std::endl;
-      os <<                parts.args.substr(0, parts.args.size() - 2)  << std::endl;
-      os << "Decl:" << std::endl;
-           os << parts.declarations << std::endl;
-           os << "Init:" << std::endl;
-           os << parts.initialization << std::endl;
-           os << "body:" << std::endl;
-           os << parts.body << std::endl;
-           os << "body_suffix:" << std::endl;
-           os << parts.body_suffix << std::endl;
-           os << "reduction_1d:" << std::endl;
-           os << parts.reduction_1d << std::endl;
-           os << "reduction_2d:" << std::endl;
-           os << parts.reduction_2d << std::endl;
-    return os;
+  os << "args:" << std::endl;
+  os << parts.args.substr(0, parts.args.size() - 2) << std::endl;
+  os << "Decl:" << std::endl;
+  os << parts.declarations << std::endl;
+  os << "Init:" << std::endl;
+  os << parts.initialization << std::endl;
+  os << "body:" << std::endl;
+  os << parts.body << std::endl;
+  os << "body_suffix:" << std::endl;
+  os << parts.body_suffix << std::endl;
+  os << "reduction_1d:" << std::endl;
+  os << parts.reduction_1d << std::endl;
+  os << "reduction_2d:" << std::endl;
+  os << parts.reduction_2d << std::endl;
+  return os;
 }
 
 /**
diff --git a/stan/math/opencl/rev/adjoint_results.hpp b/stan/math/opencl/rev/adjoint_results.hpp
index 4e671fc6837..4a2dede57d0 100644
--- a/stan/math/opencl/rev/adjoint_results.hpp
+++ b/stan/math/opencl/rev/adjoint_results.hpp
@@ -41,21 +41,24 @@ class adjoint_results_cl : protected results_cl<T_results...> {
     index_apply<sizeof...(T_expressions)>([&](auto... Is) {
       auto scalars = std::tuple_cat(select_scalar_assignments(
           std::get<Is>(this->results_), std::get<Is>(exprs.expressions_))...);
-      auto nonscalars_tmp = std::tuple_cat(select_nonscalar_assignments<assign_op_cl::plus_equals>(
-          std::get<Is>(this->results_), std::get<Is>(exprs.expressions_))...);
+      auto nonscalars_tmp = std::tuple_cat(
+          select_nonscalar_assignments<assign_op_cl::plus_equals>(
+              std::get<Is>(this->results_),
+              std::get<Is>(exprs.expressions_))...);
 
       index_apply<std::tuple_size<decltype(nonscalars_tmp)>::value>(
           [&](auto... Is_nonscal) {
-            auto nonscalars = std::make_tuple(std::make_pair(
-                std::get<Is_nonscal>(nonscalars_tmp).first, std::get<Is_nonscal>(nonscalars_tmp).second)...);
- 
- 
+            auto nonscalars = std::make_tuple(
+                std::make_pair(std::get<Is_nonscal>(nonscalars_tmp).first,
+                               std::get<Is_nonscal>(nonscalars_tmp).second)...);
+
             index_apply<std::tuple_size<decltype(scalars)>::value>(
                 [&](auto... Is_scal) {
                   // evaluate all expressions
                   this->assignment_impl(std::tuple_cat(
                       nonscalars,
-                      this->template make_assignment_pair<assign_op_cl::plus_equals>(
+                      this->template make_assignment_pair<
+                          assign_op_cl::plus_equals>(
                           std::get<2>(std::get<Is_scal>(scalars)),
                           sum_2d(std::get<1>(std::get<Is_scal>(scalars))))...));
 
@@ -99,11 +102,10 @@ class adjoint_results_cl : protected results_cl<T_results...> {
     return std::make_tuple();
   }
 
-
   /**
    * Selects assignments that have non-scalar var results.
-   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
-   *  is assigned using standard or compound assign. 
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the
+   * object is assigned using standard or compound assign.
    * @tparam T_result type of result. This overload is used for non-scalar vars.
    * @tparam T_expression type of expression
    * @param result result
@@ -121,8 +123,8 @@ class adjoint_results_cl : protected results_cl<T_results...> {
   }
   /**
    * Selects assignments that have non-scalar var results.
-   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the object 
-   *  is assigned using standard or compound assign. 
+   * @tparam AssignOp an optional `assign_op_cl` that dictates whether the
+   * object is assigned using standard or compound assign.
    * @tparam T_result type of result. This overload is used for results that are
    * either scalars or not vars.
    * @tparam T_expression type of expression
@@ -130,8 +132,8 @@ class adjoint_results_cl : protected results_cl<T_results...> {
    * @param expression expression
    * @return empty tuple
    */
-  template <assign_op_cl AssignOp,
-      typename T_result, typename T_expression,
+  template <
+      assign_op_cl AssignOp, typename T_result, typename T_expression,
       std::enable_if_t<is_stan_scalar<T_result>::value
                        || !is_var<scalar_type_t<T_result>>::value>* = nullptr>
   auto select_nonscalar_assignments(T_result&& result,
diff --git a/stan/math/opencl/rev/grad.hpp b/stan/math/opencl/rev/grad.hpp
index 46b0e7cb0d7..f4a1dde1dc9 100644
--- a/stan/math/opencl/rev/grad.hpp
+++ b/stan/math/opencl/rev/grad.hpp
@@ -19,8 +19,7 @@ namespace math {
  * @param[in] x Variables being differentiated with respect to
  * @param[out] g Gradient, d/dx v, evaluated at x.
  */
-inline void grad(var& v, var_value<matrix_cl<double>>& x,
-                 Eigen::VectorXd& g) {
+inline void grad(var& v, var_value<matrix_cl<double>>& x, Eigen::VectorXd& g) {
   grad(v.vi_);
   g = from_matrix_cl<Eigen::VectorXd>(x.adj());
 }
diff --git a/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp b/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
index 361e72a8630..3675dbd0a0f 100644
--- a/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
+++ b/test/unit/math/opencl/kernel_generator/assignment_ops_test.cpp
@@ -9,11 +9,11 @@
 #include <string>
 
 TEST(KernelGenerator, plus_equals) {
+  using stan::math::from_matrix_cl;
   using stan::math::matrix_cl;
-  using stan::math::var_value;
-  using stan::math::var;
   using stan::math::to_matrix_cl;
-  using stan::math::from_matrix_cl;
+  using stan::math::var;
+  using stan::math::var_value;
   Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
@@ -27,11 +27,11 @@ TEST(KernelGenerator, plus_equals) {
 }
 
 TEST(KernelGenerator, minus_equals) {
+  using stan::math::from_matrix_cl;
   using stan::math::matrix_cl;
-  using stan::math::var_value;
-  using stan::math::var;
   using stan::math::to_matrix_cl;
-  using stan::math::from_matrix_cl;
+  using stan::math::var;
+  using stan::math::var_value;
   Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
@@ -45,11 +45,11 @@ TEST(KernelGenerator, minus_equals) {
 }
 
 TEST(KernelGenerator, divide_equals) {
+  using stan::math::from_matrix_cl;
   using stan::math::matrix_cl;
-  using stan::math::var_value;
-  using stan::math::var;
   using stan::math::to_matrix_cl;
-  using stan::math::from_matrix_cl;
+  using stan::math::var;
+  using stan::math::var_value;
   Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd C = Eigen::MatrixXd::Random(10, 10);
@@ -63,11 +63,11 @@ TEST(KernelGenerator, divide_equals) {
 }
 
 TEST(KernelGenerator, times_equals) {
+  using stan::math::from_matrix_cl;
   using stan::math::matrix_cl;
-  using stan::math::var_value;
-  using stan::math::var;
   using stan::math::to_matrix_cl;
-  using stan::math::from_matrix_cl;
+  using stan::math::var;
+  using stan::math::var_value;
 
   Eigen::MatrixXd A = Eigen::MatrixXd::Random(10, 10);
   Eigen::MatrixXd B = Eigen::MatrixXd::Random(10, 10);
@@ -81,6 +81,4 @@ TEST(KernelGenerator, times_equals) {
   EXPECT_MATRIX_EQ(C_cl_host, C)
 }
 
-
-
 #endif
diff --git a/test/unit/math/opencl/rev/add_test.cpp b/test/unit/math/opencl/rev/add_test.cpp
index e07c438f828..27950a35bf1 100644
--- a/test/unit/math/opencl/rev/add_test.cpp
+++ b/test/unit/math/opencl/rev/add_test.cpp
@@ -5,14 +5,12 @@
 #include <gtest/gtest.h>
 #include <algorithm>
 
-
-
 TEST(OpenCLPrim, add_aliasing) {
   stan::math::matrix_d d1(3, 3);
   d1 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
   using stan::math::matrix_cl;
-  using stan::math::var_value;
   using stan::math::var;
+  using stan::math::var_value;
   using varmat_cl = var_value<matrix_cl<double>>;
   varmat_cl d11 = stan::math::to_matrix_cl(d1);
   // Add the same matrix as the left and right hand side
diff --git a/test/unit/math/opencl/rev/grad_test.cpp b/test/unit/math/opencl/rev/grad_test.cpp
index e24a70d26b9..23d9ee330e7 100644
--- a/test/unit/math/opencl/rev/grad_test.cpp
+++ b/test/unit/math/opencl/rev/grad_test.cpp
@@ -6,13 +6,13 @@
 #include <algorithm>
 
 TEST(OpenCLGradTest, exceptions) {
-  using stan::math::var_value;
-  using stan::math::var;
   using stan::math::matrix_cl;
   using stan::math::to_matrix_cl;
+  using stan::math::var;
+  using stan::math::var_value;
   using varmat_cl = var_value<matrix_cl<double>>;
   Eigen::VectorXd a(6);
-  a << 1, 2, 3 , 4, 5, 6;
+  a << 1, 2, 3, 4, 5, 6;
   varmat_cl a_cl(to_matrix_cl(a));
   varmat_cl b_cl(a_cl.block(0, 0, 3, 1));
   varmat_cl c_cl(a_cl.block(3, 0, 3, 1));
@@ -25,7 +25,7 @@ TEST(OpenCLGradTest, exceptions) {
   stan::math::recover_memory();
   Eigen::Matrix<var, -1, 1> a_host = a;
   Eigen::Matrix<var, -1, 1> b_host = a_host.segment(0, 3);
-  Eigen::Matrix<var, -1, 1> c_host = a_host.segment(3, 3);  
+  Eigen::Matrix<var, -1, 1> c_host = a_host.segment(3, 3);
   var ret_host = stan::math::sum(add(b_host, b_host));
   Eigen::VectorXd ret_grads_host(6);
   stan::math::grad(ret_host, a_host, ret_grads_host);
@@ -33,5 +33,4 @@ TEST(OpenCLGradTest, exceptions) {
   stan::math::recover_memory();
 }
 
-
 #endif

From c6bc259c9a4a23e77d7d28a63b25ca0a785f9f9a Mon Sep 17 00:00:00 2001
From: Steve Bronder <sbronder@flatironinstitute.org>
Date: Wed, 13 Sep 2023 14:18:14 -0400
Subject: [PATCH 5/9] add back rest of add test

---
 test/unit/math/opencl/rev/add_test.cpp | 172 ++++++++++++++++++++++++-
 1 file changed, 171 insertions(+), 1 deletion(-)

diff --git a/test/unit/math/opencl/rev/add_test.cpp b/test/unit/math/opencl/rev/add_test.cpp
index 27950a35bf1..d64f360030e 100644
--- a/test/unit/math/opencl/rev/add_test.cpp
+++ b/test/unit/math/opencl/rev/add_test.cpp
@@ -5,12 +5,182 @@
 #include <gtest/gtest.h>
 #include <algorithm>
 
+TEST(OpenCLPrim, add_exceptions) {
+  stan::math::vector_d vd1(2), vd2(3);
+  stan::math::matrix_cl<double> vd11(vd1);
+  stan::math::matrix_cl<double> vd22(vd2);
+  EXPECT_THROW(stan::math::add(vd11, vd22), std::invalid_argument);
+
+  stan::math::row_vector_d rvd1(2), rvd2(3);
+  stan::math::matrix_cl<double> rvd11(rvd1);
+  stan::math::matrix_cl<double> rvd22(rvd2);
+  EXPECT_THROW(stan::math::add(rvd11, rvd22), std::invalid_argument);
+
+  stan::math::matrix_d md1(2, 2), md2(3, 3);
+  stan::math::matrix_cl<double> md11(md1);
+  stan::math::matrix_cl<double> md22(md2);
+  EXPECT_THROW(stan::math::add(md11, md22), std::invalid_argument);
+}
+
+TEST(OpenCLPrim, add_tri_value_check) {
+  Eigen::MatrixXd a(3, 3);
+  a << 1, 2, 3, 4, 5, 6, 7, 8, 9;
+  Eigen::MatrixXd b = Eigen::MatrixXd::Ones(3, 3) * -3;
+
+  stan::math::matrix_cl<double> a_cl(a);
+  stan::math::matrix_cl<double> b_cl(b);
+  stan::math::matrix_cl<double> c_cl(3, 3);
+  stan::math::matrix_cl<double> c_cl_fun(3, 3);
+  Eigen::MatrixXd c(3, 3);
+
+  a_cl.view(stan::math::matrix_cl_view::Lower);
+  b_cl.view(stan::math::matrix_cl_view::Lower);
+  c_cl = a_cl + b_cl;
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Lower);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
+                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
+                     c, 1E-8);
+
+  c_cl_fun = add(a_cl, b_cl);
+  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Lower);
+  c = stan::math::from_matrix_cl(c_cl_fun);
+  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
+                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
+                     c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Lower);
+  b_cl.view(stan::math::matrix_cl_view::Upper);
+  c_cl = a_cl + b_cl;
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
+                      + Eigen::MatrixXd(b.triangularView<Eigen::Upper>())),
+                     c, 1E-8);
+
+  c_cl_fun = add(a_cl, b_cl);
+  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl_fun);
+  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
+                      + Eigen::MatrixXd(b.triangularView<Eigen::Upper>())),
+                     c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Upper);
+  b_cl.view(stan::math::matrix_cl_view::Lower);
+  c_cl = a_cl + b_cl;
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Upper>())
+                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
+                     c, 1E-8);
+
+  c_cl_fun = add(a_cl, b_cl);
+  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl_fun);
+  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Upper>())
+                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
+                     c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Entire);
+  b_cl.view(stan::math::matrix_cl_view::Lower);
+  c_cl = a_cl + b_cl;
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR((a + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())), c,
+                     1E-8);
+
+  c_cl_fun = add(a_cl, b_cl);
+  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl_fun);
+  EXPECT_MATRIX_NEAR((a + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())), c,
+                     1E-8);
+}
+
+TEST(OpenCLPrim, add_tri_scalar_value_check) {
+  Eigen::MatrixXd a(3, 3);
+  a << 1, 2, 3, 4, 5, 6, 7, 8, 9;
+  stan::math::matrix_cl<double> a_cl(a);
+  stan::math::matrix_cl<double> c_cl(3, 3);
+  Eigen::MatrixXd c(3, 3);
+
+  using stan::math::add;
+  a_cl.view(stan::math::matrix_cl_view::Lower);
+  c_cl = add(a_cl, 1.5);
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR(
+      add(Eigen::MatrixXd(a.triangularView<Eigen::Lower>()), 1.5), c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Lower);
+  c_cl = add(1.5, a_cl);
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR(
+      add(1.5, Eigen::MatrixXd(a.triangularView<Eigen::Lower>())), c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Upper);
+  c_cl = add(a_cl, 1.5);
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR(
+      add(Eigen::MatrixXd(a.triangularView<Eigen::Upper>()), 1.5), c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Upper);
+  c_cl = add(1.5, a_cl);
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR(
+      add(1.5, Eigen::MatrixXd(a.triangularView<Eigen::Upper>())), c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Entire);
+  c_cl = add(a_cl, 1.5);
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR(add(a, 1.5), c, 1E-8);
+
+  a_cl.view(stan::math::matrix_cl_view::Entire);
+  c_cl = add(1.5, a_cl);
+  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
+  c = stan::math::from_matrix_cl(c_cl);
+  EXPECT_MATRIX_NEAR(add(1.5, a), c, 1E-8);
+}
+
+TEST(OpenCLPrim, add_batch) {
+  // used to represent 5 matrices of size 10x10
+  const int batch_size = 11;
+  const int size = 13;
+  stan::math::matrix_d a(size, size * batch_size);
+  stan::math::matrix_d a_res(size, size);
+  for (int k = 0; k < batch_size; k++) {
+    for (int i = 0; i < size; i++)
+      for (int j = 0; j < size; j++) {
+        a(i, k * size + j) = k;
+      }
+  }
+  stan::math::matrix_cl<double> a_cl(a);
+  stan::math::matrix_cl<double> a_cl_res(size, size);
+  stan::math::opencl_kernels::add_batch(cl::NDRange(size, size), a_cl_res, a_cl,
+                                        size, size, batch_size);
+  a_res = stan::math::from_matrix_cl(a_cl_res);
+  for (int k = 0; k < batch_size; k++) {
+    for (int i = 0; i < size; i++)
+      for (int j = 0; j < size; j++) {
+        a(i, j) += a(i, k * size + j);
+      }
+  }
+  for (int i = 0; i < size; i++) {
+    for (int j = 0; j < size; j++) {
+      EXPECT_EQ(a(i, j), a_res(i, j));
+    }
+  }
+}
+
 TEST(OpenCLPrim, add_aliasing) {
   stan::math::matrix_d d1(3, 3);
   d1 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
   using stan::math::matrix_cl;
-  using stan::math::var;
   using stan::math::var_value;
+  using stan::math::var;
   using varmat_cl = var_value<matrix_cl<double>>;
   varmat_cl d11 = stan::math::to_matrix_cl(d1);
   // Add the same matrix as the left and right hand side

From 1dff379133a706f496e514b67b77fdcd605c6810 Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Wed, 13 Sep 2023 14:19:26 -0400
Subject: [PATCH 6/9] [Jenkins] auto-formatting by clang-format version
 10.0.0-4ubuntu1

---
 test/unit/math/opencl/rev/add_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit/math/opencl/rev/add_test.cpp b/test/unit/math/opencl/rev/add_test.cpp
index d64f360030e..aaf591986c4 100644
--- a/test/unit/math/opencl/rev/add_test.cpp
+++ b/test/unit/math/opencl/rev/add_test.cpp
@@ -179,8 +179,8 @@ TEST(OpenCLPrim, add_aliasing) {
   stan::math::matrix_d d1(3, 3);
   d1 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
   using stan::math::matrix_cl;
-  using stan::math::var_value;
   using stan::math::var;
+  using stan::math::var_value;
   using varmat_cl = var_value<matrix_cl<double>>;
   varmat_cl d11 = stan::math::to_matrix_cl(d1);
   // Add the same matrix as the left and right hand side

From 95f40da8adeea16dcb4c25cfe5677fe710af5b60 Mon Sep 17 00:00:00 2001
From: Steve Bronder <sbronder@flatironinstitute.org>
Date: Wed, 13 Sep 2023 15:19:43 -0400
Subject: [PATCH 7/9] revert add test

---
 test/unit/math/opencl/rev/add_test.cpp | 220 +++++++------------------
 1 file changed, 61 insertions(+), 159 deletions(-)

diff --git a/test/unit/math/opencl/rev/add_test.cpp b/test/unit/math/opencl/rev/add_test.cpp
index aaf591986c4..874074968ce 100644
--- a/test/unit/math/opencl/rev/add_test.cpp
+++ b/test/unit/math/opencl/rev/add_test.cpp
@@ -5,175 +5,77 @@
 #include <gtest/gtest.h>
 #include <algorithm>
 
-TEST(OpenCLPrim, add_exceptions) {
+auto add_functor
+    = [](const auto& a, const auto& b) { return stan::math::add(a, b).eval(); };
+
+TEST(OpenCLPrim, add_v_small_zero) {
+  stan::math::vector_d d1(3), d2(3);
+  d1 << 1, 2, 3;
+  d2 << 3, 2, 1;
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d2);
+
+  stan::math::vector_d d0(0);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d0);
+
+  double d3 = 3.0;
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d3);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d1);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d0);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d3);
+}
+
+TEST(OpenCLPrim, add_rv_small_zero) {
+  stan::math::row_vector_d d1(3), d2(3);
+  d1 << 1, 2, 3;
+  d2 << 3, 2, 1;
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d2);
+
+  stan::math::vector_d d0(0);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d0);
+
+  double d3 = 3.0;
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d3);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d1);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d0);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d3);
+}
+
+TEST(OpenCLPrim, add_m_small_zero) {
+  stan::math::matrix_d d1(3, 3), d2(3, 3);
+  d1 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
+  d2 << 10, 100, 1000, 0, -10, -12, 2, 4, 8;
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d2);
+
+  stan::math::matrix_d d0(0, 0);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d0);
+
+  double d3 = 3.0;
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d1, d3);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d1);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d3, d0);
+  stan::math::test::compare_cpu_opencl_prim_rev(add_functor, d0, d3);
+}
+
+TEST(OpenCLPrim, add_rev_exceptions) {
+  using stan::math::matrix_cl;
   stan::math::vector_d vd1(2), vd2(3);
-  stan::math::matrix_cl<double> vd11(vd1);
-  stan::math::matrix_cl<double> vd22(vd2);
+  stan::math::var_value<matrix_cl<double>> vd11 = stan::math::to_matrix_cl(vd1);
+  stan::math::var_value<matrix_cl<double>> vd22 = stan::math::to_matrix_cl(vd2);
   EXPECT_THROW(stan::math::add(vd11, vd22), std::invalid_argument);
 
   stan::math::row_vector_d rvd1(2), rvd2(3);
-  stan::math::matrix_cl<double> rvd11(rvd1);
-  stan::math::matrix_cl<double> rvd22(rvd2);
+  stan::math::var_value<matrix_cl<double>> rvd11
+      = stan::math::to_matrix_cl(rvd1);
+  stan::math::var_value<matrix_cl<double>> rvd22
+      = stan::math::to_matrix_cl(rvd2);
   EXPECT_THROW(stan::math::add(rvd11, rvd22), std::invalid_argument);
 
   stan::math::matrix_d md1(2, 2), md2(3, 3);
-  stan::math::matrix_cl<double> md11(md1);
-  stan::math::matrix_cl<double> md22(md2);
+  stan::math::var_value<matrix_cl<double>> md11 = stan::math::to_matrix_cl(md1);
+  stan::math::var_value<matrix_cl<double>> md22 = stan::math::to_matrix_cl(md2);
   EXPECT_THROW(stan::math::add(md11, md22), std::invalid_argument);
 }
 
-TEST(OpenCLPrim, add_tri_value_check) {
-  Eigen::MatrixXd a(3, 3);
-  a << 1, 2, 3, 4, 5, 6, 7, 8, 9;
-  Eigen::MatrixXd b = Eigen::MatrixXd::Ones(3, 3) * -3;
-
-  stan::math::matrix_cl<double> a_cl(a);
-  stan::math::matrix_cl<double> b_cl(b);
-  stan::math::matrix_cl<double> c_cl(3, 3);
-  stan::math::matrix_cl<double> c_cl_fun(3, 3);
-  Eigen::MatrixXd c(3, 3);
-
-  a_cl.view(stan::math::matrix_cl_view::Lower);
-  b_cl.view(stan::math::matrix_cl_view::Lower);
-  c_cl = a_cl + b_cl;
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Lower);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
-                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
-                     c, 1E-8);
-
-  c_cl_fun = add(a_cl, b_cl);
-  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Lower);
-  c = stan::math::from_matrix_cl(c_cl_fun);
-  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
-                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
-                     c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Lower);
-  b_cl.view(stan::math::matrix_cl_view::Upper);
-  c_cl = a_cl + b_cl;
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
-                      + Eigen::MatrixXd(b.triangularView<Eigen::Upper>())),
-                     c, 1E-8);
-
-  c_cl_fun = add(a_cl, b_cl);
-  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl_fun);
-  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Lower>())
-                      + Eigen::MatrixXd(b.triangularView<Eigen::Upper>())),
-                     c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Upper);
-  b_cl.view(stan::math::matrix_cl_view::Lower);
-  c_cl = a_cl + b_cl;
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Upper>())
-                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
-                     c, 1E-8);
-
-  c_cl_fun = add(a_cl, b_cl);
-  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl_fun);
-  EXPECT_MATRIX_NEAR((Eigen::MatrixXd(a.triangularView<Eigen::Upper>())
-                      + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())),
-                     c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Entire);
-  b_cl.view(stan::math::matrix_cl_view::Lower);
-  c_cl = a_cl + b_cl;
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR((a + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())), c,
-                     1E-8);
-
-  c_cl_fun = add(a_cl, b_cl);
-  EXPECT_EQ(c_cl_fun.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl_fun);
-  EXPECT_MATRIX_NEAR((a + Eigen::MatrixXd(b.triangularView<Eigen::Lower>())), c,
-                     1E-8);
-}
-
-TEST(OpenCLPrim, add_tri_scalar_value_check) {
-  Eigen::MatrixXd a(3, 3);
-  a << 1, 2, 3, 4, 5, 6, 7, 8, 9;
-  stan::math::matrix_cl<double> a_cl(a);
-  stan::math::matrix_cl<double> c_cl(3, 3);
-  Eigen::MatrixXd c(3, 3);
-
-  using stan::math::add;
-  a_cl.view(stan::math::matrix_cl_view::Lower);
-  c_cl = add(a_cl, 1.5);
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR(
-      add(Eigen::MatrixXd(a.triangularView<Eigen::Lower>()), 1.5), c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Lower);
-  c_cl = add(1.5, a_cl);
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR(
-      add(1.5, Eigen::MatrixXd(a.triangularView<Eigen::Lower>())), c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Upper);
-  c_cl = add(a_cl, 1.5);
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR(
-      add(Eigen::MatrixXd(a.triangularView<Eigen::Upper>()), 1.5), c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Upper);
-  c_cl = add(1.5, a_cl);
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR(
-      add(1.5, Eigen::MatrixXd(a.triangularView<Eigen::Upper>())), c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Entire);
-  c_cl = add(a_cl, 1.5);
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR(add(a, 1.5), c, 1E-8);
-
-  a_cl.view(stan::math::matrix_cl_view::Entire);
-  c_cl = add(1.5, a_cl);
-  EXPECT_EQ(c_cl.view(), stan::math::matrix_cl_view::Entire);
-  c = stan::math::from_matrix_cl(c_cl);
-  EXPECT_MATRIX_NEAR(add(1.5, a), c, 1E-8);
-}
-
-TEST(OpenCLPrim, add_batch) {
-  // used to represent 5 matrices of size 10x10
-  const int batch_size = 11;
-  const int size = 13;
-  stan::math::matrix_d a(size, size * batch_size);
-  stan::math::matrix_d a_res(size, size);
-  for (int k = 0; k < batch_size; k++) {
-    for (int i = 0; i < size; i++)
-      for (int j = 0; j < size; j++) {
-        a(i, k * size + j) = k;
-      }
-  }
-  stan::math::matrix_cl<double> a_cl(a);
-  stan::math::matrix_cl<double> a_cl_res(size, size);
-  stan::math::opencl_kernels::add_batch(cl::NDRange(size, size), a_cl_res, a_cl,
-                                        size, size, batch_size);
-  a_res = stan::math::from_matrix_cl(a_cl_res);
-  for (int k = 0; k < batch_size; k++) {
-    for (int i = 0; i < size; i++)
-      for (int j = 0; j < size; j++) {
-        a(i, j) += a(i, k * size + j);
-      }
-  }
-  for (int i = 0; i < size; i++) {
-    for (int j = 0; j < size; j++) {
-      EXPECT_EQ(a(i, j), a_res(i, j));
-    }
-  }
-}
 
 TEST(OpenCLPrim, add_aliasing) {
   stan::math::matrix_d d1(3, 3);

From 03eb22be1368e262c8407606ac8d75bd25108f21 Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Wed, 13 Sep 2023 15:21:08 -0400
Subject: [PATCH 8/9] [Jenkins] auto-formatting by clang-format version
 10.0.0-4ubuntu1

---
 test/unit/math/opencl/rev/add_test.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/unit/math/opencl/rev/add_test.cpp b/test/unit/math/opencl/rev/add_test.cpp
index 874074968ce..f678fbc255c 100644
--- a/test/unit/math/opencl/rev/add_test.cpp
+++ b/test/unit/math/opencl/rev/add_test.cpp
@@ -76,7 +76,6 @@ TEST(OpenCLPrim, add_rev_exceptions) {
   EXPECT_THROW(stan::math::add(md11, md22), std::invalid_argument);
 }
 
-
 TEST(OpenCLPrim, add_aliasing) {
   stan::math::matrix_d d1(3, 3);
   d1 << 1, 2, 3, 4, 5, 6, 7, 8, 9;

From 29644f65785a44596f1e059954a6c76f9f518772 Mon Sep 17 00:00:00 2001
From: Steve Bronder <sbronder@flatironinstitute.org>
Date: Thu, 14 Sep 2023 17:12:38 -0400
Subject: [PATCH 9/9] add inline to operator<< for kernel_parts

---
 stan/math/opencl/kernel_generator/operation_cl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stan/math/opencl/kernel_generator/operation_cl.hpp b/stan/math/opencl/kernel_generator/operation_cl.hpp
index f5995484da0..43a50b59654 100644
--- a/stan/math/opencl/kernel_generator/operation_cl.hpp
+++ b/stan/math/opencl/kernel_generator/operation_cl.hpp
@@ -61,7 +61,7 @@ struct kernel_parts {
             args + other.args};
   }
 
-  kernel_parts operator+=(const kernel_parts& other) {
+  kernel_parts& operator+=(const kernel_parts& other) {
     includes += other.includes;
     declarations += other.declarations;
     initialization += other.initialization;
@@ -75,7 +75,7 @@ struct kernel_parts {
   }
 };
 
-std::ostream& operator<<(std::ostream& os, kernel_parts& parts) {
+inline std::ostream& operator<<(std::ostream& os, kernel_parts& parts) {
   os << "args:" << std::endl;
   os << parts.args.substr(0, parts.args.size() - 2) << std::endl;
   os << "Decl:" << std::endl;