From 3c26b49899e47bf033ca958cf8473b55c4e364d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Tue, 8 Jun 2021 10:22:22 +0200
Subject: [PATCH 01/15] double double

---
 stan/math/opencl/copy.hpp                     |   2 +-
 stan/math/opencl/double_d.hpp                 | 322 ++++++++++++++++++
 .../kernel_generator/matrix_cl_conversion.hpp |   4 +-
 stan/math/opencl/matrix_cl.hpp                |  11 +-
 stan/math/opencl/zeros_strict_tri.hpp         |   2 +-
 test/unit/math/opencl/double_d_test.cpp       | 179 ++++++++++
 6 files changed, 507 insertions(+), 13 deletions(-)
 create mode 100644 stan/math/opencl/double_d.hpp
 create mode 100644 test/unit/math/opencl/double_d_test.cpp
diff --git a/stan/math/opencl/copy.hpp b/stan/math/opencl/copy.hpp
index 574d1734b0a..62dd7915879 100644
--- a/stan/math/opencl/copy.hpp
+++ b/stan/math/opencl/copy.hpp
@@ -56,7 +56,7 @@ inline matrix_cl<scalar_type_t<T>> to_matrix_cl(T&& src) {
  * @return Eigen matrix with a copy of the data in the source matrix
  */
 template <typename T_ret, typename T,
-          require_eigen_vt<std::is_arithmetic, T_ret>* = nullptr,
+          require_eigen_t<T_ret>* = nullptr,
           require_matrix_cl_t<T>* = nullptr,
           require_st_same<T_ret, T>* = nullptr>
 inline auto from_matrix_cl(const T& src) {
diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp
new file mode 100644
index 00000000000..d2587801246
--- /dev/null
+++ b/stan/math/opencl/double_d.hpp
@@ -0,0 +1,322 @@
+#ifndef STAN_MATH_OPENCL_DOUBLE_D_HPP
+#define STAN_MATH_OPENCL_DOUBLE_D_HPP
+
+#include <stan/math/opencl/stringify.hpp>
+
+#include <string>
+#include <cmath>
+#include <cfloat>
+#include <limits>
+
+/**
+ * Alternative stringify, that also exposes the code for use in C++ host code.
+ */
+#define STRINGIFY2(A) \
+#A;                 \
+  A
+namespace stan {
+namespace math {
+namespace internal {
+
+/**
+ * Double double - a 128 bit floating point number defined as an exact sum of 2
+ * doubles. `low` should be less than 1 ulp of `high`.
+ */
+typedef struct double_d {
+  double high;
+  double low;
+
+  double_d() {}
+  double_d(double a) {
+    high = a;
+    low = 0;
+  }
+  double_d(double h, double l) {
+    high = h;
+    low = l;
+  }
+} double_d;
+
+using std::copysign;
+using std::fabs;
+using std::isfinite;
+using std::isinf;
+using std::isnan;
+using std::ldexp;
+
+// \cond
+const char* double_d_src = STRINGIFY(
+    // \endcond
+    typedef struct {
+      double high;
+      double low;
+    } double_d;
+    // \cond
+    )
+    STRINGIFY2(
+        // \endcond
+
+        //    inline double_d mul_d_d(double a, double b) {
+        //      double_d res;
+        //      volatile double temp = a * b;
+        //      res.high = temp;
+        //      res.low = fma(a, b, -temp);
+        //      return res;
+        //    }
+
+        inline double_d mul_d_d(double a, double b) {
+          double_d res;
+          const double C = (2 << 26) + 1;
+          double p = a * C;
+          if (!isfinite(p)) {
+            res.high = p;
+            res.low = 0;
+            return res;
+          }
+          double hx = a - p + p;
+          double tx = a - hx;
+          p = b * C;
+          if (!isfinite(p)) {
+            res.high = p;
+            res.low = 0;
+            return res;
+          }
+          double hy = b - p + p;
+          double ty = b - hy;
+          p = hx * hy;
+          double q = hx * ty + tx * hy;
+          res.high = p + q;
+          if (isfinite(res.high)) {
+            res.low = p - res.high + q + tx * ty;
+          } else {
+            res.low = 0;
+          }
+          return res;
+        }
+
+        inline double_d neg(double_d a) {
+          double_d res;
+          res.high = -a.high;
+          res.low = -a.low;
+          return res;
+        }
+
+        inline double_d add_dd_dd(double_d a, double_d b) {
+          double_d res;
+          double high = a.high + b.high;
+          if (isfinite(high)) {
+            double low;
+            if (fabs(a.high) > fabs(b.high)) {
+              low = a.high - high + b.high;
+            } else {
+              low = b.high - high + a.high;
+            }
+            low += a.low + b.low;
+            res.high = high + low;
+            res.low = high - res.high + low;
+          } else {
+            res.high = high;
+            res.low = 0;
+          }
+          return res;
+        }
+
+        inline double_d add_dd_d(double_d a, double b) {
+          double_d res;
+          double high = a.high + b;
+          if (isfinite(high)) {
+            double low;
+            if (fabs(a.high) > fabs(b)) {
+              low = a.high - high + b;
+            } else {
+              low = b - high + a.high;
+            }
+            low += a.low;
+            res.high = high + low;
+            res.low = high - res.high + low;
+          } else {
+            res.high = high;
+            res.low = 0;
+          }
+          return res;
+        }
+
+        inline double_d sub_dd_dd(double_d a, double_d b) {
+          return add_dd_dd(a, neg(b));
+        }
+
+        inline double_d sub_dd_d(double_d a, double b) {
+          return add_dd_d(a, -b);
+        }
+
+        inline double_d sub_d_dd(double a, double_d b) {
+          return add_dd_d(neg(b), a);
+        }
+
+        inline double_d mul_dd_dd(double_d a, double_d b) {
+          double_d high = mul_d_d(a.high, b.high);
+          double_d res;
+          if (isfinite(high.high)) {
+            high.low += a.high * b.low + a.low * b.high;
+            res.high = high.high + high.low;
+            res.low = high.high - res.high + high.low;
+          } else {
+            high.low = 0;
+            return high;
+          }
+          return res;
+        }
+
+        inline double_d mul_dd_d(double_d a, double b) {
+          double_d high = mul_d_d(a.high, b);
+          double_d res;
+          if (isfinite(high.high)) {
+            high.low += a.low * b;
+            res.high = high.high + high.low;
+            res.low = high.high - res.high + high.low;
+            return res;
+          } else {
+            high.low = 0;
+            return high;
+          }
+        }
+
+        inline double_d div_dd_dd(double_d a, double_d b) {
+          double high = a.high / b.high;
+          double_d res;
+          if (isfinite(high)) {
+            double_d u = mul_d_d(high, b.high);
+            if (isfinite(u.high)) {
+              double low
+                  = (a.high - u.high - u.low + a.low - high * b.low) / b.high;
+              res.high = high + low;
+              res.low = high - res.high + low;
+              return res;
+            }
+          }
+          res.high = high;
+          res.low = 0;
+          return res;
+        }
+
+        inline double_d div_dd_d(double_d a, double b) {
+          double high = a.high / b;
+          double_d res;
+          if (isfinite(high)) {
+            double_d u = mul_d_d(high, b);
+            double low = (a.high - u.high - u.low + a.low) / b;
+            res.high = high + low;
+            res.low = high - res.high + low;
+          } else {
+            res.high = high;
+            res.low = 0;
+          }
+          return res;
+        }
+
+        inline double_d div_d_dd(double a, double_d b) {
+          double high = a / b.high;
+          double_d res;
+          if (isfinite(high)) {
+            double_d u = mul_d_d(high, b.high);
+            double low = (a - u.high - u.low - high * b.low) / b.high;
+            res.high = high + low;
+            res.low = high - res.high + low;
+          } else {
+            res.high = high;
+            res.low = 0;
+          }
+          return res;
+        }
+
+        inline double copysign_d_dd(double a, double_d b) {
+          return copysign(a, b.high);
+        }
+
+        inline double_d abs_dd(double_d a) { return a.high > 0 ? a : neg(a); }
+
+        inline bool isnan_dd(double_d a) { return isnan(a.high); }
+
+        inline bool isinf_dd(double_d a) { return isinf(a.high); }
+
+        inline bool lt_dd_dd(double_d a, double_d b) {
+          return a.high < b.high || (a.high == b.high && a.low < b.low);
+        }
+
+        inline bool lt_d_dd(double a, double_d b) {
+          return lt_dd_dd((double_d){a, 0}, b);
+        }
+
+        inline bool lt_dd_d(double_d a, double b) {
+          return lt_dd_dd(a, (double_d){b, 0});
+        }
+
+        inline bool gt_dd_dd(double_d a, double_d b) {
+          return a.high > b.high || (a.high == b.high && a.low > b.low);
+        }
+
+        inline bool gt_d_dd(double a, double_d b) {
+          return gt_dd_dd((double_d){a, 0}, b);
+        }
+
+        inline bool gt_dd_d(double_d a, double b) {
+          return gt_dd_dd(a, (double_d){b, 0});
+        }
+
+        inline bool le_dd_dd(double_d a, double_d b) {
+          return !gt_dd_dd(a, b);
+        } inline bool le_d_dd(double a, double_d b) {
+          return !gt_d_dd(a, b);
+        } inline bool le_dd_d(double_d a, double b) { return !gt_dd_d(a, b); }
+
+        inline bool ge_dd_dd(double_d a, double_d b) {
+          return !lt_dd_dd(a, b);
+        } inline bool ge_d_dd(double a, double_d b) {
+          return !lt_d_dd(a, b);
+        } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); }
+
+        // \cond
+    );
+// \endcond
+
+inline double_d operator+(double_d a, double b) { return add_dd_d(a, b); }
+inline double_d operator+(double a, double_d b) { return add_dd_d(b, a); }
+inline double_d operator+(double_d a, double_d b) { return add_dd_dd(a, b); }
+
+inline double_d operator-(double a, double_d b) { return sub_d_dd(a, b); }
+inline double_d operator-(double_d a, double b) { return sub_dd_d(a, b); }
+inline double_d operator-(double_d a, double_d b) { return sub_dd_dd(a, b); }
+
+inline double_d operator*(double_d a, double b) { return mul_dd_d(a, b); }
+inline double_d operator*(double a, double_d b) { return mul_dd_d(b, a); }
+inline double_d operator*(double_d a, double_d b) { return mul_dd_dd(a, b); }
+
+inline double_d operator/(double_d a, double b) { return div_dd_d(a, b); }
+inline double_d operator/(double a, double_d b) { return div_d_dd(a, b); }
+inline double_d operator/(double_d a, double_d b) { return div_dd_dd(a, b); }
+
+inline double_d operator-(double_d a) { return neg(a); }
+
+inline bool operator<(double_d a, double_d b) { return lt_dd_dd(a, b); }
+inline bool operator<(double a, double_d b) { return lt_d_dd(a, b); }
+inline bool operator<(double_d a, double b) { return lt_dd_d(a, b); }
+inline bool operator>(double_d a, double_d b) { return gt_dd_dd(a, b); }
+inline bool operator>(double a, double_d b) { return gt_d_dd(a, b); }
+inline bool operator>(double_d a, double b) { return gt_dd_d(a, b); }
+inline bool operator<=(double_d a, double_d b) { return le_dd_dd(a, b); }
+inline bool operator<=(double a, double_d b) { return le_d_dd(a, b); }
+inline bool operator<=(double_d a, double b) { return le_dd_d(a, b); }
+inline bool operator>=(double_d a, double_d b) { return ge_dd_dd(a, b); }
+inline bool operator>=(double a, double_d b) { return ge_d_dd(a, b); }
+inline bool operator>=(double_d a, double b) { return ge_dd_d(a, b); }
+
+inline double_d fabs(double_d a) { return abs_dd(a); }
+inline bool isnan(double_d a) { return isnan_dd(a); }
+inline bool isinf(double_d a) { return isinf_dd(a); }
+inline double copysign(double a, double_d b) { return copysign_d_dd(a, b); }
+
+}  // namespace internal
+}  // namespace math
+}  // namespace stan
+
+#endif
diff --git a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
index c91e6a62a05..02bdbde82d1 100644
--- a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
+++ b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
@@ -14,7 +14,7 @@ namespace math {
 template <typename T>
 template <typename Expr,
           require_all_kernel_expressions_and_none_scalar_t<Expr>*>
-matrix_cl<T, require_arithmetic_t<T>>::matrix_cl(const Expr& expresion)
+matrix_cl<T>::matrix_cl(const Expr& expresion)
     : rows_(0), cols_(0) {
   results(*this) = expressions(expresion);
 }
@@ -22,7 +22,7 @@ matrix_cl<T, require_arithmetic_t<T>>::matrix_cl(const Expr& expresion)
 template <typename T>
 template <typename Expr,
           require_all_kernel_expressions_and_none_scalar_t<Expr>*>
-matrix_cl<T>& matrix_cl<T, require_arithmetic_t<T>>::operator=(
+matrix_cl<T>& matrix_cl<T>::operator=(
     const Expr& expresion) {
   results(*this) = expressions(expresion);
   return *this;
diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp
index e4949d69761..b12cd6bd4f4 100644
--- a/stan/math/opencl/matrix_cl.hpp
+++ b/stan/math/opencl/matrix_cl.hpp
@@ -31,7 +31,7 @@ namespace math {
  *  @{
  */
 
-template <typename, typename = void>
+template <typename>
 class matrix_cl;
 
 /**
@@ -39,7 +39,7 @@ class matrix_cl;
  * @tparam T an arithmetic type for the type stored in the OpenCL buffer.
  */
 template <typename T>
-class matrix_cl<T, require_arithmetic_t<T>> : public matrix_cl_base {
+class matrix_cl : public matrix_cl_base {
  private:
   cl::Buffer buffer_cl_;  // Holds the allocated memory on the device
   int rows_{0};           // Number of rows.
@@ -624,13 +624,6 @@ class matrix_cl<T, require_arithmetic_t<T>> : public matrix_cl_base {
     delete static_cast<U*>(container);
   }
 };
-
-template <typename T>
-using matrix_cl_prim = matrix_cl<T, require_arithmetic_t<T>>;
-
-template <typename T>
-using matrix_cl_fp = matrix_cl<T, require_floating_point_t<T>>;
-
 /** @}*/
 
 }  // namespace math
diff --git a/stan/math/opencl/zeros_strict_tri.hpp b/stan/math/opencl/zeros_strict_tri.hpp
index 66ff85c64f4..30770a7aef9 100644
--- a/stan/math/opencl/zeros_strict_tri.hpp
+++ b/stan/math/opencl/zeros_strict_tri.hpp
@@ -29,7 +29,7 @@ namespace math {
  */
 template <typename T>
 template <matrix_cl_view matrix_view>
-inline void matrix_cl<T, require_arithmetic_t<T>>::zeros_strict_tri() try {
+inline void matrix_cl<T>::zeros_strict_tri() try {
   if (matrix_view == matrix_cl_view::Entire) {
     invalid_argument(
         "zeros_strict_tri", "matrix_view",
diff --git a/test/unit/math/opencl/double_d_test.cpp b/test/unit/math/opencl/double_d_test.cpp
new file mode 100644
index 00000000000..98764b25194
--- /dev/null
+++ b/test/unit/math/opencl/double_d_test.cpp
@@ -0,0 +1,179 @@
+
+#include <stan/math/opencl/double_d.hpp>
+//#include <test/unit/math/expect_near_rel.hpp>
+#include <gtest/gtest.h>
+#include <limits>
+#include <Eigen/Core>
+
+#define EXPECT_NORMALIZED(a) \
+  EXPECT_LT(std::abs(a.low), \
+            std::abs(a.high) * std::numeric_limits<double>::epsilon());
+
+TEST(double_d, add_dd_dd_test) {
+  using stan::math::internal::add_dd_dd;
+  using stan::math::internal::double_d;
+  double eps = std::numeric_limits<double>::epsilon();
+  double h_eps = eps * 0.5;
+  double_d a{1.0, h_eps};
+  double_d c{h_eps, 0.0};
+  double_d d{-1.0, -h_eps + eps * h_eps};
+
+  // simple
+  double_d res = add_dd_dd(a, a);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == 2.0);
+  EXPECT_TRUE(res.low == eps);
+
+  // carry
+  res = add_dd_dd(a, c);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == 1.0 + eps);
+  EXPECT_TRUE(res.low == 0.0);
+
+  // cancelation
+  res = add_dd_dd(a, d);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == eps * h_eps);
+  EXPECT_TRUE(res.low == 0.0);
+}
+
+TEST(double_d, mul_dd_dd_test) {
+  using stan::math::internal::double_d;
+  using stan::math::internal::mul_dd_dd;
+  double eps = std::numeric_limits<double>::epsilon();
+  double h_eps = eps * 0.5;
+  double_d a{1.0, h_eps * 0.001};
+  double_d b{h_eps, h_eps * h_eps * h_eps * h_eps};
+  double_d c{1.0, h_eps};
+  double_d d{1.0, -h_eps};
+
+  // simple
+  double_d res = mul_dd_dd(a, b);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == h_eps);
+  EXPECT_TRUE(res.low == h_eps * h_eps * 0.001);
+
+  // carry
+  res = mul_dd_dd(c, c);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == 1.0 + eps);
+  EXPECT_TRUE(res.low == 0.0);
+
+  // cancelation
+  res = mul_dd_dd(c, d);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == 1.0);
+  EXPECT_TRUE(res.low == 0.0);
+}
+
+TEST(double_d, div_dd_dd_test) {
+  using stan::math::internal::div_dd_dd;
+  using stan::math::internal::double_d;
+  double eps = std::numeric_limits<double>::epsilon();
+  double h_eps = eps * 0.5;
+  double_d a{1.0, h_eps};
+  double_d b{1.0, h_eps * (1 - eps)};
+  double_d c{0.0, 0.0};
+  double_d d{1.0, -h_eps};
+
+  // simple
+  double_d res = div_dd_dd(a, a);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == 1.0);
+  EXPECT_TRUE(res.low == 0.0);
+
+  res = div_dd_dd(a, b);
+  EXPECT_NORMALIZED(res);
+  EXPECT_TRUE(res.high == 1.0);
+  EXPECT_TRUE(res.low == h_eps * eps);
+
+  // div by zero
+  res = div_dd_dd(a, c);
+  EXPECT_TRUE(res.high == std::numeric_limits<double>::infinity());
+  EXPECT_TRUE(res.low == 0);
+}
+
+// this is the best test we have, but it relies on the nonstandard gcc extension
+// quadmath, so it is not run by default
+#ifdef GCC_QUADMATH_TEST
+#include <quadmath.h>
+
+#define EXPECT_DD_F128_EQ(dd, f128) \
+  tmp = (dd);                       \
+  EXPECT_NEAR((__float128)tmp.high + tmp.low, (f128), 1e-30 * 1e30)
+
+TEST(double_d, all) {
+  using stan::math::internal::double_d;
+  for (int i = 0; i < 10000000; i++) {
+    double_d tmp;
+    double high = Eigen::MatrixXd::Random(0, 0).coeff(0, 0) * 1e30;
+    double low = Eigen::MatrixXd::Random(0, 0).coeff(0, 0) * 1e-20 * 1e30;
+    double_d dd = high;
+    dd.low = low;
+    __float128 f128 = high;
+    f128 += low;
+    EXPECT_DD_F128_EQ(dd, f128);
+
+    double high2 = Eigen::MatrixXd::Random(0, 0).coeff(0, 0);
+    double low2 = Eigen::MatrixXd::Random(0, 0).coeff(0, 0) * 1e-20;
+    double_d dd2 = high2;
+    dd2.low = low2;
+    __float128 f1282 = high2;
+    f1282 += low2;
+    EXPECT_DD_F128_EQ(dd2, f1282);
+
+    EXPECT_DD_F128_EQ(dd + dd2, f128 + f1282);
+    EXPECT_DD_F128_EQ(dd - dd2, f128 - f1282);
+    EXPECT_DD_F128_EQ(dd * dd2, f128 * f1282);
+    EXPECT_DD_F128_EQ(dd / dd2, f128 / f1282);
+
+    __float128 f1283 = high;
+    __float128 f1284 = high2;
+    EXPECT_DD_F128_EQ(stan::math::internal::mul_d_d(high, high2),
+                      f1283 * f1284);
+  }
+}
+#endif
+
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/prim.hpp>
+static const std::string double_d_test_kernel_code
+    = STRINGIFY(__kernel void double_d_test_kernel(__global double_d *C,
+                                                   const __global double_d *A,
+                                                   const __global double *B) {
+        const int i = get_global_id(0);
+        C[i] = mul_dd_d(A[i], B[i]);
+      });
+
+const stan::math::opencl_kernels::kernel_cl<
+    stan::math::opencl_kernels::out_buffer,
+    stan::math::opencl_kernels::in_buffer,
+    stan::math::opencl_kernels::in_buffer>
+    double_d_test_kernel("double_d_test_kernel",
+                         {stan::math::internal::double_d_src,
+                          double_d_test_kernel_code});
+
+TEST(double_d, opencl) {
+  using stan::math::internal::double_d;
+  using VectorXdd = Eigen::Matrix<double_d, -1, 1>;
+  int n = 10;
+  VectorXdd a(n);
+  Eigen::VectorXd b = Eigen::VectorXd::Random(n);
+  for (int i = 0; i < n; i++) {
+    a[i].high = Eigen::VectorXd::Random(1)[0];
+    a[i].low = Eigen::VectorXd::Random(1)[0] * 1e-17;
+  }
+  stan::math::matrix_cl<double_d> a_cl(a);
+  stan::math::matrix_cl<double> b_cl(b);
+  stan::math::matrix_cl<double_d> c_cl(n, 1);
+  double_d_test_kernel(cl::NDRange(n), c_cl, a_cl, b_cl);
+  VectorXdd c = stan::math::from_matrix_cl(c_cl);
+  for (int i = 0; i < n; i++) {
+    double_d correct = a[i] * b[i];
+    EXPECT_EQ(c[i].high, correct.high);
+    EXPECT_NEAR(c[i].low, correct.low, 1e-30);
+  }
+}
+
+#endif

From b5824c2b0f15872daee1a87b854727d33285df6c Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Tue, 8 Jun 2021 08:59:30 +0000
Subject: [PATCH 02/15] [Jenkins] auto-formatting by clang-format version
 6.0.0-1ubuntu2~16.04.1 (tags/RELEASE_600/final)

---
 stan/math/opencl/copy.hpp                                  | 3 +--
 stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp | 6 ++----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/stan/math/opencl/copy.hpp b/stan/math/opencl/copy.hpp
index 62dd7915879..b61c3f127c8 100644
--- a/stan/math/opencl/copy.hpp
+++ b/stan/math/opencl/copy.hpp
@@ -55,8 +55,7 @@ inline matrix_cl<scalar_type_t<T>> to_matrix_cl(T&& src) {
  * @param src source matrix on the OpenCL device
  * @return Eigen matrix with a copy of the data in the source matrix
  */
-template <typename T_ret, typename T,
-          require_eigen_t<T_ret>* = nullptr,
+template <typename T_ret, typename T, require_eigen_t<T_ret>* = nullptr,
           require_matrix_cl_t<T>* = nullptr,
           require_st_same<T_ret, T>* = nullptr>
 inline auto from_matrix_cl(const T& src) {
diff --git a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
index 02bdbde82d1..fd1455ec91f 100644
--- a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
+++ b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
@@ -14,16 +14,14 @@ namespace math {
 template <typename T>
 template <typename Expr,
           require_all_kernel_expressions_and_none_scalar_t<Expr>*>
-matrix_cl<T>::matrix_cl(const Expr& expresion)
-    : rows_(0), cols_(0) {
+matrix_cl<T>::matrix_cl(const Expr& expresion) : rows_(0), cols_(0) {
   results(*this) = expressions(expresion);
 }
 
 template <typename T>
 template <typename Expr,
           require_all_kernel_expressions_and_none_scalar_t<Expr>*>
-matrix_cl<T>& matrix_cl<T>::operator=(
-    const Expr& expresion) {
+matrix_cl<T>& matrix_cl<T>::operator=(const Expr& expresion) {
   results(*this) = expressions(expresion);
   return *this;
 }

From d69b95fcc4cae1fe9c1d89c5665be18a1e26c80b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Wed, 9 Jun 2021 14:43:57 +0200
Subject: [PATCH 03/15] fix cpplint

---
 stan/math/opencl/double_d.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp
index d2587801246..224b96a58ee 100644
--- a/stan/math/opencl/double_d.hpp
+++ b/stan/math/opencl/double_d.hpp
@@ -274,7 +274,6 @@ const char* double_d_src = STRINGIFY(
         } inline bool ge_d_dd(double a, double_d b) {
           return !lt_d_dd(a, b);
         } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); }
-
         // \cond
     );
 // \endcond

From 83cbd33597e1136132c59b7360508bf0749e962c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Wed, 9 Jun 2021 14:49:22 +0200
Subject: [PATCH 04/15] nolint

---
 stan/math/opencl/double_d.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp
index 224b96a58ee..57693baf46f 100644
--- a/stan/math/opencl/double_d.hpp
+++ b/stan/math/opencl/double_d.hpp
@@ -275,7 +275,7 @@ const char* double_d_src = STRINGIFY(
           return !lt_d_dd(a, b);
         } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); }
         // \cond
-    );
+    ); //NOLINT(whitespace/parens)
 // \endcond
 
 inline double_d operator+(double_d a, double b) { return add_dd_d(a, b); }

From 58dbeb0eb302e482b17ec304c16bcb03cf806cb9 Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Wed, 9 Jun 2021 12:55:08 +0000
Subject: [PATCH 05/15] [Jenkins] auto-formatting by clang-format version
 6.0.0-1ubuntu2~16.04.1 (tags/RELEASE_600/final)

---
 stan/math/opencl/double_d.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp
index 57693baf46f..f2011c85bb3 100644
--- a/stan/math/opencl/double_d.hpp
+++ b/stan/math/opencl/double_d.hpp
@@ -275,7 +275,7 @@ const char* double_d_src = STRINGIFY(
           return !lt_d_dd(a, b);
         } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); }
         // \cond
-    ); //NOLINT(whitespace/parens)
+    );  // NOLINT(whitespace/parens)
 // \endcond
 
 inline double_d operator+(double_d a, double b) { return add_dd_d(a, b); }

From 79ed6663d01620d626554414ecbf481f8fd8b8e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Wed, 9 Jun 2021 15:07:02 +0200
Subject: [PATCH 06/15] cpplint2

---
 test/unit/math/opencl/double_d_test.cpp | 38 ++++++++++++-------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/test/unit/math/opencl/double_d_test.cpp b/test/unit/math/opencl/double_d_test.cpp
index 98764b25194..fb7de3f87bf 100644
--- a/test/unit/math/opencl/double_d_test.cpp
+++ b/test/unit/math/opencl/double_d_test.cpp
@@ -1,6 +1,4 @@
-
 #include <stan/math/opencl/double_d.hpp>
-//#include <test/unit/math/expect_near_rel.hpp>
 #include <gtest/gtest.h>
 #include <limits>
 #include <Eigen/Core>
@@ -21,20 +19,20 @@ TEST(double_d, add_dd_dd_test) {
   // simple
   double_d res = add_dd_dd(a, a);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == 2.0);
-  EXPECT_TRUE(res.low == eps);
+  EXPECT_TRUE(res.high, 2.0);
+  EXPECT_TRUE(res.low, eps);
 
   // carry
   res = add_dd_dd(a, c);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == 1.0 + eps);
-  EXPECT_TRUE(res.low == 0.0);
+  EXPECT_TRUE(res.high, 1.0 + eps);
+  EXPECT_TRUE(res.low, 0.0);
 
   // cancelation
   res = add_dd_dd(a, d);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == eps * h_eps);
-  EXPECT_TRUE(res.low == 0.0);
+  EXPECT_TRUE(res.high, eps * h_eps);
+  EXPECT_TRUE(res.low, 0.0);
 }
 
 TEST(double_d, mul_dd_dd_test) {
@@ -50,20 +48,20 @@ TEST(double_d, mul_dd_dd_test) {
   // simple
   double_d res = mul_dd_dd(a, b);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == h_eps);
-  EXPECT_TRUE(res.low == h_eps * h_eps * 0.001);
+  EXPECT_TRUE(res.high, h_eps);
+  EXPECT_TRUE(res.low, h_eps * h_eps * 0.001);
 
   // carry
   res = mul_dd_dd(c, c);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == 1.0 + eps);
-  EXPECT_TRUE(res.low == 0.0);
+  EXPECT_TRUE(res.high, 1.0 + eps);
+  EXPECT_TRUE(res.low, 0.0);
 
   // cancelation
   res = mul_dd_dd(c, d);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == 1.0);
-  EXPECT_TRUE(res.low == 0.0);
+  EXPECT_TRUE(res.high, 1.0);
+  EXPECT_TRUE(res.low, 0.0);
 }
 
 TEST(double_d, div_dd_dd_test) {
@@ -79,18 +77,18 @@ TEST(double_d, div_dd_dd_test) {
   // simple
   double_d res = div_dd_dd(a, a);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == 1.0);
-  EXPECT_TRUE(res.low == 0.0);
+  EXPECT_TRUE(res.high, 1.0);
+  EXPECT_TRUE(res.low, 0.0);
 
   res = div_dd_dd(a, b);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high == 1.0);
-  EXPECT_TRUE(res.low == h_eps * eps);
+  EXPECT_TRUE(res.high, 1.0);
+  EXPECT_TRUE(res.low, h_eps * eps);
 
   // div by zero
   res = div_dd_dd(a, c);
-  EXPECT_TRUE(res.high == std::numeric_limits<double>::infinity());
-  EXPECT_TRUE(res.low == 0);
+  EXPECT_TRUE(res.high, std::numeric_limits<double>::infinity());
+  EXPECT_TRUE(res.low, 0);
 }
 
 // this is the best test we have, but it relies on the nonstandard gcc extension

From 52803818972530a8cc0e25d781fb6ee5311546f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Wed, 9 Jun 2021 15:13:29 +0200
Subject: [PATCH 07/15] typo

---
 test/unit/math/opencl/double_d_test.cpp | 36 ++++++++++++-------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/test/unit/math/opencl/double_d_test.cpp b/test/unit/math/opencl/double_d_test.cpp
index fb7de3f87bf..ffbad5e7ff2 100644
--- a/test/unit/math/opencl/double_d_test.cpp
+++ b/test/unit/math/opencl/double_d_test.cpp
@@ -19,20 +19,20 @@ TEST(double_d, add_dd_dd_test) {
   // simple
   double_d res = add_dd_dd(a, a);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, 2.0);
-  EXPECT_TRUE(res.low, eps);
+  EXPECT_EQ(res.high, 2.0);
+  EXPECT_EQ(res.low, eps);
 
   // carry
   res = add_dd_dd(a, c);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, 1.0 + eps);
-  EXPECT_TRUE(res.low, 0.0);
+  EXPECT_EQ(res.high, 1.0 + eps);
+  EXPECT_EQ(res.low, 0.0);
 
   // cancelation
   res = add_dd_dd(a, d);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, eps * h_eps);
-  EXPECT_TRUE(res.low, 0.0);
+  EXPECT_EQ(res.high, eps * h_eps);
+  EXPECT_EQ(res.low, 0.0);
 }
 
 TEST(double_d, mul_dd_dd_test) {
@@ -48,20 +48,20 @@ TEST(double_d, mul_dd_dd_test) {
   // simple
   double_d res = mul_dd_dd(a, b);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, h_eps);
-  EXPECT_TRUE(res.low, h_eps * h_eps * 0.001);
+  EXPECT_EQ(res.high, h_eps);
+  EXPECT_EQ(res.low, h_eps * h_eps * 0.001);
 
   // carry
   res = mul_dd_dd(c, c);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, 1.0 + eps);
-  EXPECT_TRUE(res.low, 0.0);
+  EXPECT_EQ(res.high, 1.0 + eps);
+  EXPECT_EQ(res.low, 0.0);
 
   // cancelation
   res = mul_dd_dd(c, d);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, 1.0);
-  EXPECT_TRUE(res.low, 0.0);
+  EXPECT_EQ(res.high, 1.0);
+  EXPECT_EQ(res.low, 0.0);
 }
 
 TEST(double_d, div_dd_dd_test) {
@@ -77,18 +77,18 @@ TEST(double_d, div_dd_dd_test) {
   // simple
   double_d res = div_dd_dd(a, a);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, 1.0);
-  EXPECT_TRUE(res.low, 0.0);
+  EXPECT_EQ(res.high, 1.0);
+  EXPECT_EQ(res.low, 0.0);
 
   res = div_dd_dd(a, b);
   EXPECT_NORMALIZED(res);
-  EXPECT_TRUE(res.high, 1.0);
-  EXPECT_TRUE(res.low, h_eps * eps);
+  EXPECT_EQ(res.high, 1.0);
+  EXPECT_EQ(res.low, h_eps * eps);
 
   // div by zero
   res = div_dd_dd(a, c);
-  EXPECT_TRUE(res.high, std::numeric_limits<double>::infinity());
-  EXPECT_TRUE(res.low, 0);
+  EXPECT_EQ(res.high, std::numeric_limits<double>::infinity());
+  EXPECT_EQ(res.low, 0);
 }
 
 // this is the best test we have, but it relies on the nonstandard gcc extension

From 604ce6ab0f4fc1ea3625d9c06103302c15f339d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Wed, 16 Jun 2021 13:26:48 +0200
Subject: [PATCH 08/15] added opencl kernel for reverse mode of indexing

---
 stan/math/opencl/indexing_rev.hpp             |  46 ++++++
 .../device_functions/atomic_add_double.hpp    |  77 +++++++++
 stan/math/opencl/kernels/indexing_rev.hpp     | 148 ++++++++++++++++++
 test/unit/math/opencl/indexing_rev_test.cpp   |  50 ++++++
 4 files changed, 321 insertions(+)
 create mode 100644 stan/math/opencl/indexing_rev.hpp
 create mode 100644 stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
 create mode 100644 stan/math/opencl/kernels/indexing_rev.hpp
 create mode 100644 test/unit/math/opencl/indexing_rev_test.cpp

diff --git a/stan/math/opencl/indexing_rev.hpp b/stan/math/opencl/indexing_rev.hpp
new file mode 100644
index 00000000000..2d99680455d
--- /dev/null
+++ b/stan/math/opencl/indexing_rev.hpp
@@ -0,0 +1,46 @@
+#ifndef STAN_MATH_OPENCL_INDEXING_REV_HPP
+#define STAN_MATH_OPENCL_INDEXING_REV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/kernel_generator.hpp>
+#include <stan/math/opencl/kernels/indexing_rev.hpp>
+#include <stan/math/opencl/kernels/add.hpp>
+
+namespace stan {
+namespace math {
+
+void indexing_rev(matrix_cl<double>& adj, const matrix_cl<int>& idx,
+                  const matrix_cl<double>& res) {
+  int local_mem_size
+      = opencl_context.device()[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+  int preferred_work_groups
+      = opencl_context.device()[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+  int local_size = 64;
+  int n_threads = preferred_work_groups * 16 * local_size;
+
+  try {
+    if (local_mem_size > sizeof(double) * adj.size() * local_size * 2) {
+      stan::math::opencl_kernels::indexing_rev_local_independent(
+          cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res,
+          cl::Local(sizeof(double) * adj.size() * local_size), res.size(),
+          adj.size());
+    } else if (local_mem_size > sizeof(double) * adj.size()) {
+      stan::math::opencl_kernels::indexing_rev_local_atomic(
+          cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res,
+          cl::Local(sizeof(double) * adj.size()), res.size(), adj.size());
+    } else {
+      stan::math::opencl_kernels::indexing_rev_global_atomic(
+          cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res,
+          res.size());
+    }
+  } catch (cl::Error& e) {
+    check_opencl_error("indexing reverse pass", e);
+  }
+}
+
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
new file mode 100644
index 00000000000..c17577edcca
--- /dev/null
+++ b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
@@ -0,0 +1,77 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_ATOMIC_ADD_DOUBLE_HPP
+#define STAN_MATH_OPENCL_KERNELS_DEVICE_ATOMIC_ADD_DOUBLE_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/stringify.hpp>
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+static const char *atomic_add_double_device_function
+    = "\n"
+      "#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n"
+      "#define STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n"
+      "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n" STRINGIFY(
+          // \endcond
+            /** \ingroup opencl_kernels
+             * Atomically add to a double value.
+             *
+             * Code is taken from:
+             * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
+             *
+             * @param a pointer to value to add to
+             * @param b value to add
+             */
+            void atomic_add_double(__global double *val, double delta) {
+              union {
+                double f;
+                ulong i;
+              } old_val;
+              union {
+                double f;
+                ulong i;
+              } new_val;
+              do {
+                old_val.f = *val;
+                new_val.f = old_val.f + delta;
+              } while (atom_cmpxchg((volatile __global ulong *)val, old_val.i,
+                                    new_val.i)
+                       != old_val.i);
+            }
+            /** \ingroup opencl_kernels
+             * Atomically add to a local double value.
+             *
+             * Code is taken from:
+             * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
+             *
+             * @param a pointer to value to add to
+             * @param b value to add
+             */
+            void local_atomic_add_double(__local double *val, double delta) {
+              union {
+                double f;
+                ulong i;
+              } old_val;
+              union {
+                double f;
+                ulong i;
+              } new_val;
+              do {
+                old_val.f = *val;
+                new_val.f = old_val.f + delta;
+              } while (atom_cmpxchg((volatile __local ulong *)val, old_val.i,
+                                    new_val.i)
+                       != old_val.i);
+            }
+          // \cond
+          ) "\n#endif\n";  // NOLINT
+// \endcond
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/stan/math/opencl/kernels/indexing_rev.hpp b/stan/math/opencl/kernels/indexing_rev.hpp
new file mode 100644
index 00000000000..c29149541fa
--- /dev/null
+++ b/stan/math/opencl/kernels/indexing_rev.hpp
@@ -0,0 +1,148 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_INDEXING_REV_HPP
+#define STAN_MATH_OPENCL_KERNELS_INDEXING_REV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/kernel_cl.hpp>
+#include <stan/math/opencl/buffer_types.hpp>
+#include <stan/math/opencl/matrix_cl_view.hpp>
+#include <stan/math/opencl/kernels/device_functions/atomic_add_double.hpp>
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+
+// \cond
+static const std::string indexing_rev_global_atomic_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     *
+     * @param[in,out] adj adjoint to increment
+     * @param index
+     * @param res adjoint of the result of indexing
+     * @param batch_size Number of matrices in the batch.
+     * @note Code is a <code>const char*</code> held in
+     * <code>add_batch_kernel_code.</code>
+     */
+    __kernel void indexing_rev(__global double* adj, const __global int* index,
+                               const __global double* res, int size) {
+      const int gid = get_global_id(0);
+      const int gsize = get_global_size(0);
+      for (int i = gid; i < size; i += gsize) {
+        atomic_add_double(adj + index[i], res[i]);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/add.hpp add_batch() \endlink
+ */
+const kernel_cl<in_out_buffer, in_buffer, in_buffer, int>
+    indexing_rev_global_atomic("indexing_rev",
+                               {atomic_add_double_device_function,
+                                indexing_rev_global_atomic_kernel_code});
+
+// \cond
+static const std::string indexing_rev_local_atomic_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     *
+     * @param[in,out] adj adjoint to increment
+     * @param index
+     * @param res adjoint of the result of indexing
+     * @param batch_size Number of matrices in the batch.
+     * @note Code is a <code>const char*</code> held in
+     * <code>add_batch_kernel_code.</code>
+     */
+    __kernel void indexing_rev(__global double* adj, const __global int* index,
+                               const __global double* res,
+                               __local double* adj_loc, int index_size,
+                               int adj_size) {
+      const int gid = get_global_id(0);
+      const int lid = get_local_id(0);
+      const int gsize = get_global_size(0);
+      const int lsize = get_local_size(0);
+      for (int i = lid; i < adj_size; i += lsize) {
+        adj_loc[i] = 0;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = gid; i < index_size; i += gsize) {
+        local_atomic_add_double(adj_loc + index[i], res[i]);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = lid; i < adj_size; i += lsize) {
+        atomic_add_double(adj + i, adj_loc[i]);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/add.hpp add_batch() \endlink
+ */
+const kernel_cl<in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int,
+                int>
+    indexing_rev_local_atomic("indexing_rev",
+                              {atomic_add_double_device_function,
+                               indexing_rev_local_atomic_kernel_code});
+
+// \cond
+static const std::string indexing_rev_local_independent_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     *
+     * @param[in,out] adj adjoint to increment
+     * @param index
+     * @param res adjoint of the result of indexing
+     * @param batch_size Number of matrices in the batch.
+     * @note Code is a <code>const char*</code> held in
+     * <code>add_batch_kernel_code.</code>
+     */
+    __kernel void indexing_rev(__global double* adj, const __global int* index,
+                               const __global double* res,
+                               __local double* adj_loc, int index_size,
+                               int adj_size) {
+      const int gid = get_global_id(0);
+      const int lid = get_local_id(0);
+      const int gsize = get_global_size(0);
+      const int lsize = get_local_size(0);
+      for (int i = lid; i < adj_size * lsize; i += lsize) {
+        adj_loc[i] = 0;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = gid; i < index_size; i += gsize) {
+        adj_loc[index[i] + lid * adj_size] += res[i];
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = lid; i < adj_size; i += lsize) {
+        double p = adj_loc[i + adj_size];
+        for (int j = 2; j < lsize; j++) {
+          p += adj_loc[i + j * adj_size];
+        }
+        adj_loc[i] += p;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = lid; i < adj_size; i += lsize) {
+        atomic_add_double(adj + i, adj_loc[i]);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/add.hpp add_batch() \endlink
+ */
+const kernel_cl<in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int,
+                int>
+    indexing_rev_local_independent(
+        "indexing_rev", {atomic_add_double_device_function,
+                         indexing_rev_local_independent_kernel_code});
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/test/unit/math/opencl/indexing_rev_test.cpp b/test/unit/math/opencl/indexing_rev_test.cpp
new file mode 100644
index 00000000000..d7f1c40400f
--- /dev/null
+++ b/test/unit/math/opencl/indexing_rev_test.cpp
@@ -0,0 +1,50 @@
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/indexing_rev.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <test/unit/util.hpp>
+#include <test/unit/math/expect_near_rel.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+
+TEST(indexing_rev, indexing_rev_small) {
+  Eigen::MatrixXd res(3, 3);
+  res << 1, 2, 3, 4, 5, 6, 7, 8, 9;
+  Eigen::MatrixXi idx(3, 3);
+  idx << 1, 2, 1, 0, 1, 2, 1, 1, 1;
+  Eigen::MatrixXd adj = Eigen::MatrixXd::Zero(3, 1);
+  Eigen::MatrixXd correct(3, 1);
+  correct << 4, 33, 8;
+  stan::math::matrix_cl<double> res_cl(res);
+  stan::math::matrix_cl<int> idx_cl(idx);
+  stan::math::matrix_cl<double> adj_cl(adj);
+
+  stan::math::indexing_rev(adj_cl, idx_cl, res_cl);
+  EXPECT_NEAR_REL(stan::math::from_matrix_cl(adj_cl), correct);
+}
+
+TEST(indexing_rev, indexing_rev_large) {
+  int N = 377;
+
+  // different sizes of adj ensure all three kernels are tested regardless of
+  // the OpenCL device
+  for (int M = 1; M < 1e6; M *= 2) {
+    Eigen::MatrixXd res = Eigen::MatrixXd::Random(N, N);
+    Eigen::MatrixXi idx(N, N);
+    for (int i = 0; i < N * N; i++) {
+      idx(i) = std::abs(Eigen::MatrixXi::Random(1, 1)(0)) % M;
+    }
+    Eigen::MatrixXd adj = Eigen::MatrixXd::Zero(M, 1);
+    Eigen::MatrixXd correct = adj;
+    for (int i = 0; i < N * N; i++) {
+      correct(idx(i)) += res(i);
+    }
+    stan::math::matrix_cl<double> res_cl(res);
+    stan::math::matrix_cl<int> idx_cl(idx);
+    stan::math::matrix_cl<double> adj_cl(adj);
+
+    stan::math::indexing_rev(adj_cl, idx_cl, res_cl);
+    EXPECT_NEAR_REL(stan::math::from_matrix_cl(adj_cl), correct);
+  }
+}
+
+#endif

From d3bb4b3a569ec339831a93e0bff6adf5786fc2da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Mon, 21 Jun 2021 12:27:48 +0200
Subject: [PATCH 09/15] added is_rev_kernel_expression

---
 stan/math/prim/meta/is_kernel_expression.hpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/stan/math/prim/meta/is_kernel_expression.hpp b/stan/math/prim/meta/is_kernel_expression.hpp
index 8dde7061598..1063302f41a 100644
--- a/stan/math/prim/meta/is_kernel_expression.hpp
+++ b/stan/math/prim/meta/is_kernel_expression.hpp
@@ -76,15 +76,20 @@ struct is_kernel_expression_lhs
 template <typename T>
 struct is_kernel_expression_lhs<T, require_matrix_cl_t<T>> : std::true_type {};
 
+/**
+ * Determines whether a type is a var containing a kernel generator expression.
+ */
+template <typename T>
+struct is_rev_kernel_expression
+    : math::conjunction<is_var<T>, is_kernel_expression<value_type_t<T>>> {};
+
 /**
  * Determines whether a type is either a kernel generator
  * expression or a var containing a kernel generator expression.
  */
 template <typename T>
 struct is_prim_or_rev_kernel_expression
-    : math::disjunction<
-          is_kernel_expression<T>,
-          math::conjunction<is_var<T>, is_kernel_expression<value_type_t<T>>>> {
+    : math::disjunction<is_kernel_expression<T>, is_rev_kernel_expression<T>> {
 };
 
 /**
@@ -101,6 +106,8 @@ struct is_nonscalar_prim_or_rev_kernel_expression
 /** @}*/
 STAN_ADD_REQUIRE_UNARY(kernel_expression_lhs, is_kernel_expression_lhs,
                        opencl_kernel_generator);
+STAN_ADD_REQUIRE_UNARY(rev_kernel_expression, is_rev_kernel_expression,
+                       opencl_kernel_generator);
 STAN_ADD_REQUIRE_UNARY(prim_or_rev_kernel_expression,
                        is_prim_or_rev_kernel_expression,
                        opencl_kernel_generator);

From c4887becad0bdaf041237618c151cc72bd3ebd08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Mon, 21 Jun 2021 12:27:48 +0200
Subject: [PATCH 10/15] added is_rev_kernel_expression

---
 stan/math/opencl/indexing_rev.hpp            |  9 +++++++++
 stan/math/opencl/kernels/indexing_rev.hpp    | 19 +++++++++++++++++++
 stan/math/prim/meta/is_kernel_expression.hpp | 13 ++++++++++---
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/stan/math/opencl/indexing_rev.hpp b/stan/math/opencl/indexing_rev.hpp
index 2d99680455d..94be70c2c68 100644
--- a/stan/math/opencl/indexing_rev.hpp
+++ b/stan/math/opencl/indexing_rev.hpp
@@ -10,6 +10,15 @@
 namespace stan {
 namespace math {
 
+/**
+ * Performs reverse pass for indexing operation on the OpenCL device. Depending
+ * on the size of indexed matrix and the amount of local memory available on the
+ * device selects the best kernel to use for the operation.
+ *
+ * @param[in,out] adj adjoint of the argument to indexing
+ * @param idx indices
+ * @param res adjoint of the result of the indexing operation
+ */
 void indexing_rev(matrix_cl<double>& adj, const matrix_cl<int>& idx,
                   const matrix_cl<double>& res) {
   int local_mem_size
diff --git a/stan/math/opencl/kernels/indexing_rev.hpp b/stan/math/opencl/kernels/indexing_rev.hpp
index c29149541fa..f4a601396e7 100644
--- a/stan/math/opencl/kernels/indexing_rev.hpp
+++ b/stan/math/opencl/kernels/indexing_rev.hpp
@@ -16,6 +16,12 @@ namespace opencl_kernels {
 static const std::string indexing_rev_global_atomic_kernel_code = STRINGIFY(
     // \endcond
     /** \ingroup opencl_kernels
+     *
+     * Increments adjoint of the indexing operation argument given the indices
+     * and adjoints of the indexing result.
+     *
+     * This kernel uses global atomics and is the fastest for large sizes of the
+     * indexed matrix.
      *
      * @param[in,out] adj adjoint to increment
      * @param index
@@ -48,6 +54,12 @@ const kernel_cl<in_out_buffer, in_buffer, in_buffer, int>
 static const std::string indexing_rev_local_atomic_kernel_code = STRINGIFY(
     // \endcond
     /** \ingroup opencl_kernels
+     *
+     * Increments adjoint of the indexing operation argument given the indices
+     * and adjoints of the indexing result.
+     *
+     * This kernel uses local atomics and is the fastest for medium sizes of the
+     * indexed matrix (and will not work for large sizes).
      *
      * @param[in,out] adj adjoint to increment
      * @param index
@@ -93,6 +105,13 @@ const kernel_cl<in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int,
 static const std::string indexing_rev_local_independent_kernel_code = STRINGIFY(
     // \endcond
     /** \ingroup opencl_kernels
+     *
+     * Increments adjoint of the indexing operation argument given the indices
+     * and adjoints of the indexing result.
+     *
+     * This kernel makes each thread build its own copy of the adjoints before
+     * combining them. It is the fastest (and only works for) small size of the
+     * indexed matrix.
      *
      * @param[in,out] adj adjoint to increment
      * @param index
diff --git a/stan/math/prim/meta/is_kernel_expression.hpp b/stan/math/prim/meta/is_kernel_expression.hpp
index 8dde7061598..1063302f41a 100644
--- a/stan/math/prim/meta/is_kernel_expression.hpp
+++ b/stan/math/prim/meta/is_kernel_expression.hpp
@@ -76,15 +76,20 @@ struct is_kernel_expression_lhs
 template <typename T>
 struct is_kernel_expression_lhs<T, require_matrix_cl_t<T>> : std::true_type {};
 
+/**
+ * Determines whether a type is a var containing a kernel generator expression.
+ */
+template <typename T>
+struct is_rev_kernel_expression
+    : math::conjunction<is_var<T>, is_kernel_expression<value_type_t<T>>> {};
+
 /**
  * Determines whether a type is either a kernel generator
  * expression or a var containing a kernel generator expression.
  */
 template <typename T>
 struct is_prim_or_rev_kernel_expression
-    : math::disjunction<
-          is_kernel_expression<T>,
-          math::conjunction<is_var<T>, is_kernel_expression<value_type_t<T>>>> {
+    : math::disjunction<is_kernel_expression<T>, is_rev_kernel_expression<T>> {
 };
 
 /**
@@ -101,6 +106,8 @@ struct is_nonscalar_prim_or_rev_kernel_expression
 /** @}*/
 STAN_ADD_REQUIRE_UNARY(kernel_expression_lhs, is_kernel_expression_lhs,
                        opencl_kernel_generator);
+STAN_ADD_REQUIRE_UNARY(rev_kernel_expression, is_rev_kernel_expression,
+                       opencl_kernel_generator);
 STAN_ADD_REQUIRE_UNARY(prim_or_rev_kernel_expression,
                        is_prim_or_rev_kernel_expression,
                        opencl_kernel_generator);

From c76cfa6d91ecf285204b3b131d1cf3205aece465 Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Mon, 21 Jun 2021 11:22:26 +0000
Subject: [PATCH 11/15] [Jenkins] auto-formatting by clang-format version
 6.0.0-1ubuntu2~16.04.1 (tags/RELEASE_600/final)

---
 .../device_functions/atomic_add_double.hpp    | 100 +++++++++---------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
index c17577edcca..8ae6977ba6b 100644
--- a/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
+++ b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
@@ -15,56 +15,56 @@ static const char *atomic_add_double_device_function
       "#define STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n"
       "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n" STRINGIFY(
           // \endcond
-            /** \ingroup opencl_kernels
-             * Atomically add to a double value.
-             *
-             * Code is taken from:
-             * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
-             *
-             * @param a pointer to value to add to
-             * @param b value to add
-             */
-            void atomic_add_double(__global double *val, double delta) {
-              union {
-                double f;
-                ulong i;
-              } old_val;
-              union {
-                double f;
-                ulong i;
-              } new_val;
-              do {
-                old_val.f = *val;
-                new_val.f = old_val.f + delta;
-              } while (atom_cmpxchg((volatile __global ulong *)val, old_val.i,
-                                    new_val.i)
-                       != old_val.i);
-            }
-            /** \ingroup opencl_kernels
-             * Atomically add to a local double value.
-             *
-             * Code is taken from:
-             * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
-             *
-             * @param a pointer to value to add to
-             * @param b value to add
-             */
-            void local_atomic_add_double(__local double *val, double delta) {
-              union {
-                double f;
-                ulong i;
-              } old_val;
-              union {
-                double f;
-                ulong i;
-              } new_val;
-              do {
-                old_val.f = *val;
-                new_val.f = old_val.f + delta;
-              } while (atom_cmpxchg((volatile __local ulong *)val, old_val.i,
-                                    new_val.i)
-                       != old_val.i);
-            }
+          /** \ingroup opencl_kernels
+           * Atomically add to a double value.
+           *
+           * Code is taken from:
+           * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
+           *
+           * @param a pointer to value to add to
+           * @param b value to add
+           */
+          void atomic_add_double(__global double *val, double delta) {
+            union {
+              double f;
+              ulong i;
+            } old_val;
+            union {
+              double f;
+              ulong i;
+            } new_val;
+            do {
+              old_val.f = *val;
+              new_val.f = old_val.f + delta;
+            } while (atom_cmpxchg((volatile __global ulong *)val, old_val.i,
+                                  new_val.i)
+                     != old_val.i);
+          }
+          /** \ingroup opencl_kernels
+           * Atomically add to a local double value.
+           *
+           * Code is taken from:
+           * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
+           *
+           * @param a pointer to value to add to
+           * @param b value to add
+           */
+          void local_atomic_add_double(__local double *val, double delta) {
+            union {
+              double f;
+              ulong i;
+            } old_val;
+            union {
+              double f;
+              ulong i;
+            } new_val;
+            do {
+              old_val.f = *val;
+              new_val.f = old_val.f + delta;
+            } while (atom_cmpxchg((volatile __local ulong *)val, old_val.i,
+                                  new_val.i)
+                     != old_val.i);
+          }
           // \cond
           ) "\n#endif\n";  // NOLINT
 // \endcond

From a5fe75263881ca83948a1f5d7f25667f552598f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Mon, 21 Jun 2021 16:13:05 +0200
Subject: [PATCH 12/15] fixed fma multiplication

---
 stan/math/opencl/double_d.hpp | 90 ++++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 38 deletions(-)

diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp
index f2011c85bb3..99dcba23a1d 100644
--- a/stan/math/opencl/double_d.hpp
+++ b/stan/math/opencl/double_d.hpp
@@ -44,6 +44,47 @@ using std::isinf;
 using std::isnan;
 using std::ldexp;
 
+inline double_d mul_d_d(double a, double b) {
+  double_d res;
+  // for some reason the std::fma only gives accurate result if compiling for
+  // instruction set that includes a fma instruction
+#if defined(__FMA__)
+  res.high = a * b;
+  if (isfinite(res.high)) {
+    res.low = fma(a, b, -res.high);
+  } else {
+    res.low = 0;
+  }
+#else
+  const double C = (2 << 26) + 1;
+  double p = a * C;
+  if (!isfinite(p)) {
+    res.high = p;
+    res.low = 0;
+    return res;
+  }
+  double hx = a - p + p;
+  double tx = a - hx;
+  p = b * C;
+  if (!isfinite(p)) {
+    res.high = p;
+    res.low = 0;
+    return res;
+  }
+  double hy = b - p + p;
+  double ty = b - hy;
+  p = hx * hy;
+  double q = hx * ty + tx * hy;
+  res.high = p + q;
+  if (isfinite(res.high)) {
+    res.low = p - res.high + q + tx * ty;
+  } else {
+    res.low = 0;
+  }
+#endif
+  return res;
+}
+
 // \cond
 const char* double_d_src = STRINGIFY(
     // \endcond
@@ -51,49 +92,22 @@ const char* double_d_src = STRINGIFY(
       double high;
       double low;
     } double_d;
+
+    inline double_d mul_d_d(double a, double b) {
+      double_d res;
+      res.high = a * b;
+      if (isfinite(res.high)) {
+        res.low = fma(a, b, -res.high);
+      } else {
+        res.low = 0;
+      }
+      return res;
+    }
     // \cond
     )
     STRINGIFY2(
         // \endcond
 
-        //    inline double_d mul_d_d(double a, double b) {
-        //      double_d res;
-        //      volatile double temp = a * b;
-        //      res.high = temp;
-        //      res.low = fma(a, b, -temp);
-        //      return res;
-        //    }
-
-        inline double_d mul_d_d(double a, double b) {
-          double_d res;
-          const double C = (2 << 26) + 1;
-          double p = a * C;
-          if (!isfinite(p)) {
-            res.high = p;
-            res.low = 0;
-            return res;
-          }
-          double hx = a - p + p;
-          double tx = a - hx;
-          p = b * C;
-          if (!isfinite(p)) {
-            res.high = p;
-            res.low = 0;
-            return res;
-          }
-          double hy = b - p + p;
-          double ty = b - hy;
-          p = hx * hy;
-          double q = hx * ty + tx * hy;
-          res.high = p + q;
-          if (isfinite(res.high)) {
-            res.low = p - res.high + q + tx * ty;
-          } else {
-            res.low = 0;
-          }
-          return res;
-        }
-
         inline double_d neg(double_d a) {
           double_d res;
           res.high = -a.high;

From f085f04cc13c1ccdbe4cd30082512db9c97205fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Tue, 22 Jun 2021 11:42:41 +0200
Subject: [PATCH 13/15] added indexing to opencl vari

---
 stan/math/opencl/rev/vari.hpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/stan/math/opencl/rev/vari.hpp b/stan/math/opencl/rev/vari.hpp
index 5f5ef03165c..cd35d14c0f0 100644
--- a/stan/math/opencl/rev/vari.hpp
+++ b/stan/math/opencl/rev/vari.hpp
@@ -108,6 +108,27 @@ class vari_cl_base : public vari_base {
                                                     std::move(adj_t));
   }
 
+  /**
+   * Return indexed view into a matrix.
+   *
+   * Do not use with indices that reference any element of this more than once -
+   * that cans cause data races in rev operations on the result!
+   *
+   * @param row_index kg expression used for row index
+   * @param col_index kg expression used for column index
+   */
+  template <typename RowIndex, typename ColIndex>
+  auto index(const RowIndex& row_index, const ColIndex& col_index) {
+    RowIndex r1 = row_index;
+    RowIndex r2 = row_index;
+    ColIndex c1 = col_index;
+    ColIndex c2 = col_index;
+    auto&& val_t = stan::math::indexing(val_, std::move(r1), std::move(c1));
+    auto&& adj_t = stan::math::indexing(adj_, std::move(r2), std::move(c2));
+    return vari_view<std::decay_t<decltype(val_t)>>(std::move(val_t),
+                                                    std::move(adj_t));
+  }
+
   /**
    * Return the number of rows for this class's `val_` member
    */

From 64a557d34cb7a4032afc39fbdd0b503d1cbdb2b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Thu, 24 Jun 2021 07:33:10 +0200
Subject: [PATCH 14/15] fix val_op

---
 stan/math/rev/core/var.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stan/math/rev/core/var.hpp b/stan/math/rev/core/var.hpp
index 26c78556d5b..0c7ade74951 100644
--- a/stan/math/rev/core/var.hpp
+++ b/stan/math/rev/core/var.hpp
@@ -401,7 +401,7 @@ class var_value<
    * @return The value of this variable.
    */
   inline const auto& val() const { return vi_->val(); }
-  inline auto& val_op() { return vi_->val(); }
+  inline auto& val_op() { return vi_->val_op(); }
 
   /**
    * Return a reference to the derivative of the root expression with

From 6d39b25198ae0faf3b9e81cb83c156b969e3a2da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.c@gmail.com>
Date: Mon, 28 Jun 2021 14:11:25 +0200
Subject: [PATCH 15/15] fix

---
 benchmarks/benchmark.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index 7d3aacb6581..f6e1158277e 100755
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -424,13 +424,16 @@ def benchmark(
                             special_arg_values[function_name][n], numbers.Number
                     ):
                         value = special_arg_values[function_name][n]
-                if scalar == "double":
+                if not is_argument_autodiff or (
+                      not is_argument_scalar and (opencl == "base" or varmat == "base")
+				):
+                    arg_type_prim = cpp_arg_template.replace("SCALAR", "double");
                     setup += (
                         "  {} {} = stan::test::{}<{}>({}, state.range(0));\n".format(
-                            arg_type,
+                            arg_type_prim,
                             var_name,
                             make_arg_function,
-                            arg_type,
+                            arg_type_prim,
                             value,
                         )
                     )
@@ -440,9 +443,15 @@ def benchmark(
                                 var_name + "_cl", var_name
                             )
                             var_name += "_cl"
+                            if is_argument_autodiff:
+                                var_conversions += (
+                                  "    stan::math::var_value<stan::math::matrix_cl<double>> {}({});\n".format(
+                                    var_name + "_var", var_name)
+                                )
+                                var_name += "_var"
                         elif varmat == "base" and arg_overload == "Rev":
-                            setup += "  auto {} = stan::math::to_var_value({});\n".format(
-                                var_name + "_varmat", var_name
+                            var_conversions += "      stan::math::var_value<{}> {}({});\n".format(
+                                arg_type_prim, var_name + "_varmat", var_name
                             )
                             var_name += "_varmat"
                 else: