From 3c26b49899e47bf033ca958cf8473b55c4e364d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 8 Jun 2021 10:22:22 +0200 Subject: [PATCH 01/15] double double --- stan/math/opencl/copy.hpp | 2 +- stan/math/opencl/double_d.hpp | 322 ++++++++++++++++++ .../kernel_generator/matrix_cl_conversion.hpp | 4 +- stan/math/opencl/matrix_cl.hpp | 11 +- stan/math/opencl/zeros_strict_tri.hpp | 2 +- test/unit/math/opencl/double_d_test.cpp | 179 ++++++++++ 6 files changed, 507 insertions(+), 13 deletions(-) create mode 100644 stan/math/opencl/double_d.hpp create mode 100644 test/unit/math/opencl/double_d_test.cpp diff --git a/stan/math/opencl/copy.hpp b/stan/math/opencl/copy.hpp index 574d1734b0a..62dd7915879 100644 --- a/stan/math/opencl/copy.hpp +++ b/stan/math/opencl/copy.hpp @@ -56,7 +56,7 @@ inline matrix_cl> to_matrix_cl(T&& src) { * @return Eigen matrix with a copy of the data in the source matrix */ template * = nullptr, + require_eigen_t* = nullptr, require_matrix_cl_t* = nullptr, require_st_same* = nullptr> inline auto from_matrix_cl(const T& src) { diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp new file mode 100644 index 00000000000..d2587801246 --- /dev/null +++ b/stan/math/opencl/double_d.hpp @@ -0,0 +1,322 @@ +#ifndef STAN_MATH_OPENCL_DOUBLE_D_HPP +#define STAN_MATH_OPENCL_DOUBLE_D_HPP + +#include + +#include +#include +#include +#include + +/** + * Alternative stringify, that also exposes the code for use in C++ host code. + */ +#define STRINGIFY2(A) \ +#A; \ + A +namespace stan { +namespace math { +namespace internal { + +/** + * Double double - a 128 bit floating point number defined as an exact sum of 2 + * doubles. `low` should be less than 1 ulp of `high`. + */ +typedef struct double_d { + double high; + double low; + + double_d() {} + double_d(double a) { + high = a; + low = 0; + } + double_d(double h, double l) { + high = h; + low = l; + } +} double_d; + +using std::copysign; +using std::fabs; +using std::isfinite; +using std::isinf; +using std::isnan; +using std::ldexp; + +// \cond +const char* double_d_src = STRINGIFY( + // \endcond + typedef struct { + double high; + double low; + } double_d; + // \cond + ) + STRINGIFY2( + // \endcond + + // inline double_d mul_d_d(double a, double b) { + // double_d res; + // volatile double temp = a * b; + // res.high = temp; + // res.low = fma(a, b, -temp); + // return res; + // } + + inline double_d mul_d_d(double a, double b) { + double_d res; + const double C = (2 << 26) + 1; + double p = a * C; + if (!isfinite(p)) { + res.high = p; + res.low = 0; + return res; + } + double hx = a - p + p; + double tx = a - hx; + p = b * C; + if (!isfinite(p)) { + res.high = p; + res.low = 0; + return res; + } + double hy = b - p + p; + double ty = b - hy; + p = hx * hy; + double q = hx * ty + tx * hy; + res.high = p + q; + if (isfinite(res.high)) { + res.low = p - res.high + q + tx * ty; + } else { + res.low = 0; + } + return res; + } + + inline double_d neg(double_d a) { + double_d res; + res.high = -a.high; + res.low = -a.low; + return res; + } + + inline double_d add_dd_dd(double_d a, double_d b) { + double_d res; + double high = a.high + b.high; + if (isfinite(high)) { + double low; + if (fabs(a.high) > fabs(b.high)) { + low = a.high - high + b.high; + } else { + low = b.high - high + a.high; + } + low += a.low + b.low; + res.high = high + low; + res.low = high - res.high + low; + } else { + res.high = high; + res.low = 0; + } + return res; + } + + inline double_d add_dd_d(double_d a, double b) { + double_d res; + double high = a.high + b; + if (isfinite(high)) { + double low; + if (fabs(a.high) > fabs(b)) { + low = a.high - high + b; + } else { + low = b - high + a.high; + } + low += a.low; + res.high = high + low; + res.low = high - res.high + low; + } else { + res.high = high; + res.low = 0; + } + return res; + } + + inline double_d sub_dd_dd(double_d a, double_d b) { + return add_dd_dd(a, neg(b)); + } + + inline double_d sub_dd_d(double_d a, double b) { + return add_dd_d(a, -b); + } + + inline double_d sub_d_dd(double a, double_d b) { + return add_dd_d(neg(b), a); + } + + inline double_d mul_dd_dd(double_d a, double_d b) { + double_d high = mul_d_d(a.high, b.high); + double_d res; + if (isfinite(high.high)) { + high.low += a.high * b.low + a.low * b.high; + res.high = high.high + high.low; + res.low = high.high - res.high + high.low; + } else { + high.low = 0; + return high; + } + return res; + } + + inline double_d mul_dd_d(double_d a, double b) { + double_d high = mul_d_d(a.high, b); + double_d res; + if (isfinite(high.high)) { + high.low += a.low * b; + res.high = high.high + high.low; + res.low = high.high - res.high + high.low; + return res; + } else { + high.low = 0; + return high; + } + } + + inline double_d div_dd_dd(double_d a, double_d b) { + double high = a.high / b.high; + double_d res; + if (isfinite(high)) { + double_d u = mul_d_d(high, b.high); + if (isfinite(u.high)) { + double low + = (a.high - u.high - u.low + a.low - high * b.low) / b.high; + res.high = high + low; + res.low = high - res.high + low; + return res; + } + } + res.high = high; + res.low = 0; + return res; + } + + inline double_d div_dd_d(double_d a, double b) { + double high = a.high / b; + double_d res; + if (isfinite(high)) { + double_d u = mul_d_d(high, b); + double low = (a.high - u.high - u.low + a.low) / b; + res.high = high + low; + res.low = high - res.high + low; + } else { + res.high = high; + res.low = 0; + } + return res; + } + + inline double_d div_d_dd(double a, double_d b) { + double high = a / b.high; + double_d res; + if (isfinite(high)) { + double_d u = mul_d_d(high, b.high); + double low = (a - u.high - u.low - high * b.low) / b.high; + res.high = high + low; + res.low = high - res.high + low; + } else { + res.high = high; + res.low = 0; + } + return res; + } + + inline double copysign_d_dd(double a, double_d b) { + return copysign(a, b.high); + } + + inline double_d abs_dd(double_d a) { return a.high > 0 ? a : neg(a); } + + inline bool isnan_dd(double_d a) { return isnan(a.high); } + + inline bool isinf_dd(double_d a) { return isinf(a.high); } + + inline bool lt_dd_dd(double_d a, double_d b) { + return a.high < b.high || (a.high == b.high && a.low < b.low); + } + + inline bool lt_d_dd(double a, double_d b) { + return lt_dd_dd((double_d){a, 0}, b); + } + + inline bool lt_dd_d(double_d a, double b) { + return lt_dd_dd(a, (double_d){b, 0}); + } + + inline bool gt_dd_dd(double_d a, double_d b) { + return a.high > b.high || (a.high == b.high && a.low > b.low); + } + + inline bool gt_d_dd(double a, double_d b) { + return gt_dd_dd((double_d){a, 0}, b); + } + + inline bool gt_dd_d(double_d a, double b) { + return gt_dd_dd(a, (double_d){b, 0}); + } + + inline bool le_dd_dd(double_d a, double_d b) { + return !gt_dd_dd(a, b); + } inline bool le_d_dd(double a, double_d b) { + return !gt_d_dd(a, b); + } inline bool le_dd_d(double_d a, double b) { return !gt_dd_d(a, b); } + + inline bool ge_dd_dd(double_d a, double_d b) { + return !lt_dd_dd(a, b); + } inline bool ge_d_dd(double a, double_d b) { + return !lt_d_dd(a, b); + } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); } + + // \cond + ); +// \endcond + +inline double_d operator+(double_d a, double b) { return add_dd_d(a, b); } +inline double_d operator+(double a, double_d b) { return add_dd_d(b, a); } +inline double_d operator+(double_d a, double_d b) { return add_dd_dd(a, b); } + +inline double_d operator-(double a, double_d b) { return sub_d_dd(a, b); } +inline double_d operator-(double_d a, double b) { return sub_dd_d(a, b); } +inline double_d operator-(double_d a, double_d b) { return sub_dd_dd(a, b); } + +inline double_d operator*(double_d a, double b) { return mul_dd_d(a, b); } +inline double_d operator*(double a, double_d b) { return mul_dd_d(b, a); } +inline double_d operator*(double_d a, double_d b) { return mul_dd_dd(a, b); } + +inline double_d operator/(double_d a, double b) { return div_dd_d(a, b); } +inline double_d operator/(double a, double_d b) { return div_d_dd(a, b); } +inline double_d operator/(double_d a, double_d b) { return div_dd_dd(a, b); } + +inline double_d operator-(double_d a) { return neg(a); } + +inline bool operator<(double_d a, double_d b) { return lt_dd_dd(a, b); } +inline bool operator<(double a, double_d b) { return lt_d_dd(a, b); } +inline bool operator<(double_d a, double b) { return lt_dd_d(a, b); } +inline bool operator>(double_d a, double_d b) { return gt_dd_dd(a, b); } +inline bool operator>(double a, double_d b) { return gt_d_dd(a, b); } +inline bool operator>(double_d a, double b) { return gt_dd_d(a, b); } +inline bool operator<=(double_d a, double_d b) { return le_dd_dd(a, b); } +inline bool operator<=(double a, double_d b) { return le_d_dd(a, b); } +inline bool operator<=(double_d a, double b) { return le_dd_d(a, b); } +inline bool operator>=(double_d a, double_d b) { return ge_dd_dd(a, b); } +inline bool operator>=(double a, double_d b) { return ge_d_dd(a, b); } +inline bool operator>=(double_d a, double b) { return ge_dd_d(a, b); } + +inline double_d fabs(double_d a) { return abs_dd(a); } +inline bool isnan(double_d a) { return isnan_dd(a); } +inline bool isinf(double_d a) { return isinf_dd(a); } +inline double copysign(double a, double_d b) { return copysign_d_dd(a, b); } + +} // namespace internal +} // namespace math +} // namespace stan + +#endif diff --git a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp index c91e6a62a05..02bdbde82d1 100644 --- a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp +++ b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp @@ -14,7 +14,7 @@ namespace math { template template *> -matrix_cl>::matrix_cl(const Expr& expresion) +matrix_cl::matrix_cl(const Expr& expresion) : rows_(0), cols_(0) { results(*this) = expressions(expresion); } @@ -22,7 +22,7 @@ matrix_cl>::matrix_cl(const Expr& expresion) template template *> -matrix_cl& matrix_cl>::operator=( +matrix_cl& matrix_cl::operator=( const Expr& expresion) { results(*this) = expressions(expresion); return *this; diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index e4949d69761..b12cd6bd4f4 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -31,7 +31,7 @@ namespace math { * @{ */ -template +template class matrix_cl; /** @@ -39,7 +39,7 @@ class matrix_cl; * @tparam T an arithmetic type for the type stored in the OpenCL buffer. */ template -class matrix_cl> : public matrix_cl_base { +class matrix_cl : public matrix_cl_base { private: cl::Buffer buffer_cl_; // Holds the allocated memory on the device int rows_{0}; // Number of rows. @@ -624,13 +624,6 @@ class matrix_cl> : public matrix_cl_base { delete static_cast(container); } }; - -template -using matrix_cl_prim = matrix_cl>; - -template -using matrix_cl_fp = matrix_cl>; - /** @}*/ } // namespace math diff --git a/stan/math/opencl/zeros_strict_tri.hpp b/stan/math/opencl/zeros_strict_tri.hpp index 66ff85c64f4..30770a7aef9 100644 --- a/stan/math/opencl/zeros_strict_tri.hpp +++ b/stan/math/opencl/zeros_strict_tri.hpp @@ -29,7 +29,7 @@ namespace math { */ template template -inline void matrix_cl>::zeros_strict_tri() try { +inline void matrix_cl::zeros_strict_tri() try { if (matrix_view == matrix_cl_view::Entire) { invalid_argument( "zeros_strict_tri", "matrix_view", diff --git a/test/unit/math/opencl/double_d_test.cpp b/test/unit/math/opencl/double_d_test.cpp new file mode 100644 index 00000000000..98764b25194 --- /dev/null +++ b/test/unit/math/opencl/double_d_test.cpp @@ -0,0 +1,179 @@ + +#include +//#include +#include +#include +#include + +#define EXPECT_NORMALIZED(a) \ + EXPECT_LT(std::abs(a.low), \ + std::abs(a.high) * std::numeric_limits::epsilon()); + +TEST(double_d, add_dd_dd_test) { + using stan::math::internal::add_dd_dd; + using stan::math::internal::double_d; + double eps = std::numeric_limits::epsilon(); + double h_eps = eps * 0.5; + double_d a{1.0, h_eps}; + double_d c{h_eps, 0.0}; + double_d d{-1.0, -h_eps + eps * h_eps}; + + // simple + double_d res = add_dd_dd(a, a); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == 2.0); + EXPECT_TRUE(res.low == eps); + + // carry + res = add_dd_dd(a, c); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == 1.0 + eps); + EXPECT_TRUE(res.low == 0.0); + + // cancelation + res = add_dd_dd(a, d); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == eps * h_eps); + EXPECT_TRUE(res.low == 0.0); +} + +TEST(double_d, mul_dd_dd_test) { + using stan::math::internal::double_d; + using stan::math::internal::mul_dd_dd; + double eps = std::numeric_limits::epsilon(); + double h_eps = eps * 0.5; + double_d a{1.0, h_eps * 0.001}; + double_d b{h_eps, h_eps * h_eps * h_eps * h_eps}; + double_d c{1.0, h_eps}; + double_d d{1.0, -h_eps}; + + // simple + double_d res = mul_dd_dd(a, b); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == h_eps); + EXPECT_TRUE(res.low == h_eps * h_eps * 0.001); + + // carry + res = mul_dd_dd(c, c); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == 1.0 + eps); + EXPECT_TRUE(res.low == 0.0); + + // cancelation + res = mul_dd_dd(c, d); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == 1.0); + EXPECT_TRUE(res.low == 0.0); +} + +TEST(double_d, div_dd_dd_test) { + using stan::math::internal::div_dd_dd; + using stan::math::internal::double_d; + double eps = std::numeric_limits::epsilon(); + double h_eps = eps * 0.5; + double_d a{1.0, h_eps}; + double_d b{1.0, h_eps * (1 - eps)}; + double_d c{0.0, 0.0}; + double_d d{1.0, -h_eps}; + + // simple + double_d res = div_dd_dd(a, a); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == 1.0); + EXPECT_TRUE(res.low == 0.0); + + res = div_dd_dd(a, b); + EXPECT_NORMALIZED(res); + EXPECT_TRUE(res.high == 1.0); + EXPECT_TRUE(res.low == h_eps * eps); + + // div by zero + res = div_dd_dd(a, c); + EXPECT_TRUE(res.high == std::numeric_limits::infinity()); + EXPECT_TRUE(res.low == 0); +} + +// this is the best test we have, but it relies on the nonstandard gcc extension +// quadmath, so it is not run by default +#ifdef GCC_QUADMATH_TEST +#include + +#define EXPECT_DD_F128_EQ(dd, f128) \ + tmp = (dd); \ + EXPECT_NEAR((__float128)tmp.high + tmp.low, (f128), 1e-30 * 1e30) + +TEST(double_d, all) { + using stan::math::internal::double_d; + for (int i = 0; i < 10000000; i++) { + double_d tmp; + double high = Eigen::MatrixXd::Random(0, 0).coeff(0, 0) * 1e30; + double low = Eigen::MatrixXd::Random(0, 0).coeff(0, 0) * 1e-20 * 1e30; + double_d dd = high; + dd.low = low; + __float128 f128 = high; + f128 += low; + EXPECT_DD_F128_EQ(dd, f128); + + double high2 = Eigen::MatrixXd::Random(0, 0).coeff(0, 0); + double low2 = Eigen::MatrixXd::Random(0, 0).coeff(0, 0) * 1e-20; + double_d dd2 = high2; + dd2.low = low2; + __float128 f1282 = high2; + f1282 += low2; + EXPECT_DD_F128_EQ(dd2, f1282); + + EXPECT_DD_F128_EQ(dd + dd2, f128 + f1282); + EXPECT_DD_F128_EQ(dd - dd2, f128 - f1282); + EXPECT_DD_F128_EQ(dd * dd2, f128 * f1282); + EXPECT_DD_F128_EQ(dd / dd2, f128 / f1282); + + __float128 f1283 = high; + __float128 f1284 = high2; + EXPECT_DD_F128_EQ(stan::math::internal::mul_d_d(high, high2), + f1283 * f1284); + } +} +#endif + +#ifdef STAN_OPENCL + +#include +static const std::string double_d_test_kernel_code + = STRINGIFY(__kernel void double_d_test_kernel(__global double_d *C, + const __global double_d *A, + const __global double *B) { + const int i = get_global_id(0); + C[i] = mul_dd_d(A[i], B[i]); + }); + +const stan::math::opencl_kernels::kernel_cl< + stan::math::opencl_kernels::out_buffer, + stan::math::opencl_kernels::in_buffer, + stan::math::opencl_kernels::in_buffer> + double_d_test_kernel("double_d_test_kernel", + {stan::math::internal::double_d_src, + double_d_test_kernel_code}); + +TEST(double_d, opencl) { + using stan::math::internal::double_d; + using VectorXdd = Eigen::Matrix; + int n = 10; + VectorXdd a(n); + Eigen::VectorXd b = Eigen::VectorXd::Random(n); + for (int i = 0; i < n; i++) { + a[i].high = Eigen::VectorXd::Random(1)[0]; + a[i].low = Eigen::VectorXd::Random(1)[0] * 1e-17; + } + stan::math::matrix_cl a_cl(a); + stan::math::matrix_cl b_cl(b); + stan::math::matrix_cl c_cl(n, 1); + double_d_test_kernel(cl::NDRange(n), c_cl, a_cl, b_cl); + VectorXdd c = stan::math::from_matrix_cl(c_cl); + for (int i = 0; i < n; i++) { + double_d correct = a[i] * b[i]; + EXPECT_EQ(c[i].high, correct.high); + EXPECT_NEAR(c[i].low, correct.low, 1e-30); + } +} + +#endif From b5824c2b0f15872daee1a87b854727d33285df6c Mon Sep 17 00:00:00 2001 From: Stan Jenkins Date: Tue, 8 Jun 2021 08:59:30 +0000 Subject: [PATCH 02/15] [Jenkins] auto-formatting by clang-format version 6.0.0-1ubuntu2~16.04.1 (tags/RELEASE_600/final) --- stan/math/opencl/copy.hpp | 3 +-- stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/stan/math/opencl/copy.hpp b/stan/math/opencl/copy.hpp index 62dd7915879..b61c3f127c8 100644 --- a/stan/math/opencl/copy.hpp +++ b/stan/math/opencl/copy.hpp @@ -55,8 +55,7 @@ inline matrix_cl> to_matrix_cl(T&& src) { * @param src source matrix on the OpenCL device * @return Eigen matrix with a copy of the data in the source matrix */ -template * = nullptr, +template * = nullptr, require_matrix_cl_t* = nullptr, require_st_same* = nullptr> inline auto from_matrix_cl(const T& src) { diff --git a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp index 02bdbde82d1..fd1455ec91f 100644 --- a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp +++ b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp @@ -14,16 +14,14 @@ namespace math { template template *> -matrix_cl::matrix_cl(const Expr& expresion) - : rows_(0), cols_(0) { +matrix_cl::matrix_cl(const Expr& expresion) : rows_(0), cols_(0) { results(*this) = expressions(expresion); } template template *> -matrix_cl& matrix_cl::operator=( - const Expr& expresion) { +matrix_cl& matrix_cl::operator=(const Expr& expresion) { results(*this) = expressions(expresion); return *this; } From d69b95fcc4cae1fe9c1d89c5665be18a1e26c80b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 9 Jun 2021 14:43:57 +0200 Subject: [PATCH 03/15] fix cpplint --- stan/math/opencl/double_d.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp index d2587801246..224b96a58ee 100644 --- a/stan/math/opencl/double_d.hpp +++ b/stan/math/opencl/double_d.hpp @@ -274,7 +274,6 @@ const char* double_d_src = STRINGIFY( } inline bool ge_d_dd(double a, double_d b) { return !lt_d_dd(a, b); } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); } - // \cond ); // \endcond From 83cbd33597e1136132c59b7360508bf0749e962c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 9 Jun 2021 14:49:22 +0200 Subject: [PATCH 04/15] nolint --- stan/math/opencl/double_d.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp index 224b96a58ee..57693baf46f 100644 --- a/stan/math/opencl/double_d.hpp +++ b/stan/math/opencl/double_d.hpp @@ -275,7 +275,7 @@ const char* double_d_src = STRINGIFY( return !lt_d_dd(a, b); } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); } // \cond - ); + ); //NOLINT(whitespace/parens) // \endcond inline double_d operator+(double_d a, double b) { return add_dd_d(a, b); } From 58dbeb0eb302e482b17ec304c16bcb03cf806cb9 Mon Sep 17 00:00:00 2001 From: Stan Jenkins Date: Wed, 9 Jun 2021 12:55:08 +0000 Subject: [PATCH 05/15] [Jenkins] auto-formatting by clang-format version 6.0.0-1ubuntu2~16.04.1 (tags/RELEASE_600/final) --- stan/math/opencl/double_d.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp index 57693baf46f..f2011c85bb3 100644 --- a/stan/math/opencl/double_d.hpp +++ b/stan/math/opencl/double_d.hpp @@ -275,7 +275,7 @@ const char* double_d_src = STRINGIFY( return !lt_d_dd(a, b); } inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); } // \cond - ); //NOLINT(whitespace/parens) + ); // NOLINT(whitespace/parens) // \endcond inline double_d operator+(double_d a, double b) { return add_dd_d(a, b); } From 79ed6663d01620d626554414ecbf481f8fd8b8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 9 Jun 2021 15:07:02 +0200 Subject: [PATCH 06/15] cpplint2 --- test/unit/math/opencl/double_d_test.cpp | 38 ++++++++++++------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/test/unit/math/opencl/double_d_test.cpp b/test/unit/math/opencl/double_d_test.cpp index 98764b25194..fb7de3f87bf 100644 --- a/test/unit/math/opencl/double_d_test.cpp +++ b/test/unit/math/opencl/double_d_test.cpp @@ -1,6 +1,4 @@ - #include -//#include #include #include #include @@ -21,20 +19,20 @@ TEST(double_d, add_dd_dd_test) { // simple double_d res = add_dd_dd(a, a); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == 2.0); - EXPECT_TRUE(res.low == eps); + EXPECT_TRUE(res.high, 2.0); + EXPECT_TRUE(res.low, eps); // carry res = add_dd_dd(a, c); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == 1.0 + eps); - EXPECT_TRUE(res.low == 0.0); + EXPECT_TRUE(res.high, 1.0 + eps); + EXPECT_TRUE(res.low, 0.0); // cancelation res = add_dd_dd(a, d); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == eps * h_eps); - EXPECT_TRUE(res.low == 0.0); + EXPECT_TRUE(res.high, eps * h_eps); + EXPECT_TRUE(res.low, 0.0); } TEST(double_d, mul_dd_dd_test) { @@ -50,20 +48,20 @@ TEST(double_d, mul_dd_dd_test) { // simple double_d res = mul_dd_dd(a, b); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == h_eps); - EXPECT_TRUE(res.low == h_eps * h_eps * 0.001); + EXPECT_TRUE(res.high, h_eps); + EXPECT_TRUE(res.low, h_eps * h_eps * 0.001); // carry res = mul_dd_dd(c, c); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == 1.0 + eps); - EXPECT_TRUE(res.low == 0.0); + EXPECT_TRUE(res.high, 1.0 + eps); + EXPECT_TRUE(res.low, 0.0); // cancelation res = mul_dd_dd(c, d); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == 1.0); - EXPECT_TRUE(res.low == 0.0); + EXPECT_TRUE(res.high, 1.0); + EXPECT_TRUE(res.low, 0.0); } TEST(double_d, div_dd_dd_test) { @@ -79,18 +77,18 @@ TEST(double_d, div_dd_dd_test) { // simple double_d res = div_dd_dd(a, a); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == 1.0); - EXPECT_TRUE(res.low == 0.0); + EXPECT_TRUE(res.high, 1.0); + EXPECT_TRUE(res.low, 0.0); res = div_dd_dd(a, b); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high == 1.0); - EXPECT_TRUE(res.low == h_eps * eps); + EXPECT_TRUE(res.high, 1.0); + EXPECT_TRUE(res.low, h_eps * eps); // div by zero res = div_dd_dd(a, c); - EXPECT_TRUE(res.high == std::numeric_limits::infinity()); - EXPECT_TRUE(res.low == 0); + EXPECT_TRUE(res.high, std::numeric_limits::infinity()); + EXPECT_TRUE(res.low, 0); } // this is the best test we have, but it relies on the nonstandard gcc extension From 52803818972530a8cc0e25d781fb6ee5311546f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 9 Jun 2021 15:13:29 +0200 Subject: [PATCH 07/15] typo --- test/unit/math/opencl/double_d_test.cpp | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/test/unit/math/opencl/double_d_test.cpp b/test/unit/math/opencl/double_d_test.cpp index fb7de3f87bf..ffbad5e7ff2 100644 --- a/test/unit/math/opencl/double_d_test.cpp +++ b/test/unit/math/opencl/double_d_test.cpp @@ -19,20 +19,20 @@ TEST(double_d, add_dd_dd_test) { // simple double_d res = add_dd_dd(a, a); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, 2.0); - EXPECT_TRUE(res.low, eps); + EXPECT_EQ(res.high, 2.0); + EXPECT_EQ(res.low, eps); // carry res = add_dd_dd(a, c); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, 1.0 + eps); - EXPECT_TRUE(res.low, 0.0); + EXPECT_EQ(res.high, 1.0 + eps); + EXPECT_EQ(res.low, 0.0); // cancelation res = add_dd_dd(a, d); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, eps * h_eps); - EXPECT_TRUE(res.low, 0.0); + EXPECT_EQ(res.high, eps * h_eps); + EXPECT_EQ(res.low, 0.0); } TEST(double_d, mul_dd_dd_test) { @@ -48,20 +48,20 @@ TEST(double_d, mul_dd_dd_test) { // simple double_d res = mul_dd_dd(a, b); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, h_eps); - EXPECT_TRUE(res.low, h_eps * h_eps * 0.001); + EXPECT_EQ(res.high, h_eps); + EXPECT_EQ(res.low, h_eps * h_eps * 0.001); // carry res = mul_dd_dd(c, c); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, 1.0 + eps); - EXPECT_TRUE(res.low, 0.0); + EXPECT_EQ(res.high, 1.0 + eps); + EXPECT_EQ(res.low, 0.0); // cancelation res = mul_dd_dd(c, d); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, 1.0); - EXPECT_TRUE(res.low, 0.0); + EXPECT_EQ(res.high, 1.0); + EXPECT_EQ(res.low, 0.0); } TEST(double_d, div_dd_dd_test) { @@ -77,18 +77,18 @@ TEST(double_d, div_dd_dd_test) { // simple double_d res = div_dd_dd(a, a); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, 1.0); - EXPECT_TRUE(res.low, 0.0); + EXPECT_EQ(res.high, 1.0); + EXPECT_EQ(res.low, 0.0); res = div_dd_dd(a, b); EXPECT_NORMALIZED(res); - EXPECT_TRUE(res.high, 1.0); - EXPECT_TRUE(res.low, h_eps * eps); + EXPECT_EQ(res.high, 1.0); + EXPECT_EQ(res.low, h_eps * eps); // div by zero res = div_dd_dd(a, c); - EXPECT_TRUE(res.high, std::numeric_limits::infinity()); - EXPECT_TRUE(res.low, 0); + EXPECT_EQ(res.high, std::numeric_limits::infinity()); + EXPECT_EQ(res.low, 0); } // this is the best test we have, but it relies on the nonstandard gcc extension From 604ce6ab0f4fc1ea3625d9c06103302c15f339d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 16 Jun 2021 13:26:48 +0200 Subject: [PATCH 08/15] added opencl kernel for reverse mode of indexing --- stan/math/opencl/indexing_rev.hpp | 46 ++++++ .../device_functions/atomic_add_double.hpp | 77 +++++++++ stan/math/opencl/kernels/indexing_rev.hpp | 148 ++++++++++++++++++ test/unit/math/opencl/indexing_rev_test.cpp | 50 ++++++ 4 files changed, 321 insertions(+) create mode 100644 stan/math/opencl/indexing_rev.hpp create mode 100644 stan/math/opencl/kernels/device_functions/atomic_add_double.hpp create mode 100644 stan/math/opencl/kernels/indexing_rev.hpp create mode 100644 test/unit/math/opencl/indexing_rev_test.cpp diff --git a/stan/math/opencl/indexing_rev.hpp b/stan/math/opencl/indexing_rev.hpp new file mode 100644 index 00000000000..2d99680455d --- /dev/null +++ b/stan/math/opencl/indexing_rev.hpp @@ -0,0 +1,46 @@ +#ifndef STAN_MATH_OPENCL_INDEXING_REV_HPP +#define STAN_MATH_OPENCL_INDEXING_REV_HPP +#ifdef STAN_OPENCL + +#include +#include +#include +#include + +namespace stan { +namespace math { + +void indexing_rev(matrix_cl& adj, const matrix_cl& idx, + const matrix_cl& res) { + int local_mem_size + = opencl_context.device()[0].getInfo(); + int preferred_work_groups + = opencl_context.device()[0].getInfo(); + int local_size = 64; + int n_threads = preferred_work_groups * 16 * local_size; + + try { + if (local_mem_size > sizeof(double) * adj.size() * local_size * 2) { + stan::math::opencl_kernels::indexing_rev_local_independent( + cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res, + cl::Local(sizeof(double) * adj.size() * local_size), res.size(), + adj.size()); + } else if (local_mem_size > sizeof(double) * adj.size()) { + stan::math::opencl_kernels::indexing_rev_local_atomic( + cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res, + cl::Local(sizeof(double) * adj.size()), res.size(), adj.size()); + } else { + stan::math::opencl_kernels::indexing_rev_global_atomic( + cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res, + res.size()); + } + } catch (cl::Error& e) { + check_opencl_error("indexing reverse pass", e); + } +} + +} // namespace math +} // namespace stan + +#endif +#endif diff --git a/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp new file mode 100644 index 00000000000..c17577edcca --- /dev/null +++ b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp @@ -0,0 +1,77 @@ +#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_ATOMIC_ADD_DOUBLE_HPP +#define STAN_MATH_OPENCL_KERNELS_DEVICE_ATOMIC_ADD_DOUBLE_HPP +#ifdef STAN_OPENCL + +#include +#include + +namespace stan { +namespace math { +namespace opencl_kernels { +// \cond +static const char *atomic_add_double_device_function + = "\n" + "#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n" + "#define STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n" + "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n" STRINGIFY( + // \endcond + /** \ingroup opencl_kernels + * Atomically add to a double value. + * + * Code is taken from: + * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl + * + * @param a pointer to value to add to + * @param b value to add + */ + void atomic_add_double(__global double *val, double delta) { + union { + double f; + ulong i; + } old_val; + union { + double f; + ulong i; + } new_val; + do { + old_val.f = *val; + new_val.f = old_val.f + delta; + } while (atom_cmpxchg((volatile __global ulong *)val, old_val.i, + new_val.i) + != old_val.i); + } + /** \ingroup opencl_kernels + * Atomically add to a local double value. + * + * Code is taken from: + * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl + * + * @param a pointer to value to add to + * @param b value to add + */ + void local_atomic_add_double(__local double *val, double delta) { + union { + double f; + ulong i; + } old_val; + union { + double f; + ulong i; + } new_val; + do { + old_val.f = *val; + new_val.f = old_val.f + delta; + } while (atom_cmpxchg((volatile __local ulong *)val, old_val.i, + new_val.i) + != old_val.i); + } + // \cond + ) "\n#endif\n"; // NOLINT +// \endcond + +} // namespace opencl_kernels +} // namespace math +} // namespace stan + +#endif +#endif diff --git a/stan/math/opencl/kernels/indexing_rev.hpp b/stan/math/opencl/kernels/indexing_rev.hpp new file mode 100644 index 00000000000..c29149541fa --- /dev/null +++ b/stan/math/opencl/kernels/indexing_rev.hpp @@ -0,0 +1,148 @@ +#ifndef STAN_MATH_OPENCL_KERNELS_INDEXING_REV_HPP +#define STAN_MATH_OPENCL_KERNELS_INDEXING_REV_HPP +#ifdef STAN_OPENCL + +#include +#include +#include +#include +#include + +namespace stan { +namespace math { +namespace opencl_kernels { + +// \cond +static const std::string indexing_rev_global_atomic_kernel_code = STRINGIFY( + // \endcond + /** \ingroup opencl_kernels + * + * @param[in,out] adj adjoint to increment + * @param index + * @param res adjoint of the result of indexing + * @param batch_size Number of matrices in the batch. + * @note Code is a const char* held in + * add_batch_kernel_code. + */ + __kernel void indexing_rev(__global double* adj, const __global int* index, + const __global double* res, int size) { + const int gid = get_global_id(0); + const int gsize = get_global_size(0); + for (int i = gid; i < size; i += gsize) { + atomic_add_double(adj + index[i], res[i]); + } + } + // \cond +); +// \endcond + +/** \ingroup opencl_kernels + * See the docs for \link kernels/add.hpp add_batch() \endlink + */ +const kernel_cl + indexing_rev_global_atomic("indexing_rev", + {atomic_add_double_device_function, + indexing_rev_global_atomic_kernel_code}); + +// \cond +static const std::string indexing_rev_local_atomic_kernel_code = STRINGIFY( + // \endcond + /** \ingroup opencl_kernels + * + * @param[in,out] adj adjoint to increment + * @param index + * @param res adjoint of the result of indexing + * @param batch_size Number of matrices in the batch. + * @note Code is a const char* held in + * add_batch_kernel_code. + */ + __kernel void indexing_rev(__global double* adj, const __global int* index, + const __global double* res, + __local double* adj_loc, int index_size, + int adj_size) { + const int gid = get_global_id(0); + const int lid = get_local_id(0); + const int gsize = get_global_size(0); + const int lsize = get_local_size(0); + for (int i = lid; i < adj_size; i += lsize) { + adj_loc[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = gid; i < index_size; i += gsize) { + local_atomic_add_double(adj_loc + index[i], res[i]); + } + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = lid; i < adj_size; i += lsize) { + atomic_add_double(adj + i, adj_loc[i]); + } + } + // \cond +); +// \endcond + +/** \ingroup opencl_kernels + * See the docs for \link kernels/add.hpp add_batch() \endlink + */ +const kernel_cl + indexing_rev_local_atomic("indexing_rev", + {atomic_add_double_device_function, + indexing_rev_local_atomic_kernel_code}); + +// \cond +static const std::string indexing_rev_local_independent_kernel_code = STRINGIFY( + // \endcond + /** \ingroup opencl_kernels + * + * @param[in,out] adj adjoint to increment + * @param index + * @param res adjoint of the result of indexing + * @param batch_size Number of matrices in the batch. + * @note Code is a const char* held in + * add_batch_kernel_code. + */ + __kernel void indexing_rev(__global double* adj, const __global int* index, + const __global double* res, + __local double* adj_loc, int index_size, + int adj_size) { + const int gid = get_global_id(0); + const int lid = get_local_id(0); + const int gsize = get_global_size(0); + const int lsize = get_local_size(0); + for (int i = lid; i < adj_size * lsize; i += lsize) { + adj_loc[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = gid; i < index_size; i += gsize) { + adj_loc[index[i] + lid * adj_size] += res[i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = lid; i < adj_size; i += lsize) { + double p = adj_loc[i + adj_size]; + for (int j = 2; j < lsize; j++) { + p += adj_loc[i + j * adj_size]; + } + adj_loc[i] += p; + } + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = lid; i < adj_size; i += lsize) { + atomic_add_double(adj + i, adj_loc[i]); + } + } + // \cond +); +// \endcond + +/** \ingroup opencl_kernels + * See the docs for \link kernels/add.hpp add_batch() \endlink + */ +const kernel_cl + indexing_rev_local_independent( + "indexing_rev", {atomic_add_double_device_function, + indexing_rev_local_independent_kernel_code}); +} // namespace opencl_kernels +} // namespace math +} // namespace stan +#endif +#endif diff --git a/test/unit/math/opencl/indexing_rev_test.cpp b/test/unit/math/opencl/indexing_rev_test.cpp new file mode 100644 index 00000000000..d7f1c40400f --- /dev/null +++ b/test/unit/math/opencl/indexing_rev_test.cpp @@ -0,0 +1,50 @@ +#ifdef STAN_OPENCL +#include +#include +#include +#include +#include +#include + +TEST(indexing_rev, indexing_rev_small) { + Eigen::MatrixXd res(3, 3); + res << 1, 2, 3, 4, 5, 6, 7, 8, 9; + Eigen::MatrixXi idx(3, 3); + idx << 1, 2, 1, 0, 1, 2, 1, 1, 1; + Eigen::MatrixXd adj = Eigen::MatrixXd::Zero(3, 1); + Eigen::MatrixXd correct(3, 1); + correct << 4, 33, 8; + stan::math::matrix_cl res_cl(res); + stan::math::matrix_cl idx_cl(idx); + stan::math::matrix_cl adj_cl(adj); + + stan::math::indexing_rev(adj_cl, idx_cl, res_cl); + EXPECT_NEAR_REL(stan::math::from_matrix_cl(adj_cl), correct); +} + +TEST(indexing_rev, indexing_rev_large) { + int N = 377; + + // different sizes of adj ensure all three kernels are tested regardless of + // the OpenCL device + for (int M = 1; M < 1e6; M *= 2) { + Eigen::MatrixXd res = Eigen::MatrixXd::Random(N, N); + Eigen::MatrixXi idx(N, N); + for (int i = 0; i < N * N; i++) { + idx(i) = std::abs(Eigen::MatrixXi::Random(1, 1)(0)) % M; + } + Eigen::MatrixXd adj = Eigen::MatrixXd::Zero(M, 1); + Eigen::MatrixXd correct = adj; + for (int i = 0; i < N * N; i++) { + correct(idx(i)) += res(i); + } + stan::math::matrix_cl res_cl(res); + stan::math::matrix_cl idx_cl(idx); + stan::math::matrix_cl adj_cl(adj); + + stan::math::indexing_rev(adj_cl, idx_cl, res_cl); + EXPECT_NEAR_REL(stan::math::from_matrix_cl(adj_cl), correct); + } +} + +#endif From d3bb4b3a569ec339831a93e0bff6adf5786fc2da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Mon, 21 Jun 2021 12:27:48 +0200 Subject: [PATCH 09/15] added is_rev_kernel_expression --- stan/math/prim/meta/is_kernel_expression.hpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/stan/math/prim/meta/is_kernel_expression.hpp b/stan/math/prim/meta/is_kernel_expression.hpp index 8dde7061598..1063302f41a 100644 --- a/stan/math/prim/meta/is_kernel_expression.hpp +++ b/stan/math/prim/meta/is_kernel_expression.hpp @@ -76,15 +76,20 @@ struct is_kernel_expression_lhs template struct is_kernel_expression_lhs> : std::true_type {}; +/** + * Determines whether a type is a var containing a kernel generator expression. + */ +template +struct is_rev_kernel_expression + : math::conjunction, is_kernel_expression>> {}; + /** * Determines whether a type is either a kernel generator * expression or a var containing a kernel generator expression. */ template struct is_prim_or_rev_kernel_expression - : math::disjunction< - is_kernel_expression, - math::conjunction, is_kernel_expression>>> { + : math::disjunction, is_rev_kernel_expression> { }; /** @@ -101,6 +106,8 @@ struct is_nonscalar_prim_or_rev_kernel_expression /** @}*/ STAN_ADD_REQUIRE_UNARY(kernel_expression_lhs, is_kernel_expression_lhs, opencl_kernel_generator); +STAN_ADD_REQUIRE_UNARY(rev_kernel_expression, is_rev_kernel_expression, + opencl_kernel_generator); STAN_ADD_REQUIRE_UNARY(prim_or_rev_kernel_expression, is_prim_or_rev_kernel_expression, opencl_kernel_generator); From c4887becad0bdaf041237618c151cc72bd3ebd08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Mon, 21 Jun 2021 12:27:48 +0200 Subject: [PATCH 10/15] added is_rev_kernel_expression --- stan/math/opencl/indexing_rev.hpp | 9 +++++++++ stan/math/opencl/kernels/indexing_rev.hpp | 19 +++++++++++++++++++ stan/math/prim/meta/is_kernel_expression.hpp | 13 ++++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/stan/math/opencl/indexing_rev.hpp b/stan/math/opencl/indexing_rev.hpp index 2d99680455d..94be70c2c68 100644 --- a/stan/math/opencl/indexing_rev.hpp +++ b/stan/math/opencl/indexing_rev.hpp @@ -10,6 +10,15 @@ namespace stan { namespace math { +/** + * Performs reverse pass for indexing operation on the OpenCL device. Depending + * on the size of indexed matrix and the amount of local memory available on the + * device selects the best kernel to use for the operation. + * + * @param[in,out] adj adjoint of the argument to indexing + * @param idx indices + * @param res adjoint of the result of the indexing operation + */ void indexing_rev(matrix_cl& adj, const matrix_cl& idx, const matrix_cl& res) { int local_mem_size diff --git a/stan/math/opencl/kernels/indexing_rev.hpp b/stan/math/opencl/kernels/indexing_rev.hpp index c29149541fa..f4a601396e7 100644 --- a/stan/math/opencl/kernels/indexing_rev.hpp +++ b/stan/math/opencl/kernels/indexing_rev.hpp @@ -16,6 +16,12 @@ namespace opencl_kernels { static const std::string indexing_rev_global_atomic_kernel_code = STRINGIFY( // \endcond /** \ingroup opencl_kernels + * + * Increments adjoint of the indexing operation argument given the indices + * and adjoints of the indexing result. + * + * This kernel uses global atomics and is the fastest for large sizes of the + * indexed matrix. * * @param[in,out] adj adjoint to increment * @param index @@ -48,6 +54,12 @@ const kernel_cl static const std::string indexing_rev_local_atomic_kernel_code = STRINGIFY( // \endcond /** \ingroup opencl_kernels + * + * Increments adjoint of the indexing operation argument given the indices + * and adjoints of the indexing result. + * + * This kernel uses local atomics and is the fastest for medium sizes of the + * indexed matrix (and will not work for large sizes). * * @param[in,out] adj adjoint to increment * @param index @@ -93,6 +105,13 @@ const kernel_cl struct is_kernel_expression_lhs> : std::true_type {}; +/** + * Determines whether a type is a var containing a kernel generator expression. + */ +template +struct is_rev_kernel_expression + : math::conjunction, is_kernel_expression>> {}; + /** * Determines whether a type is either a kernel generator * expression or a var containing a kernel generator expression. */ template struct is_prim_or_rev_kernel_expression - : math::disjunction< - is_kernel_expression, - math::conjunction, is_kernel_expression>>> { + : math::disjunction, is_rev_kernel_expression> { }; /** @@ -101,6 +106,8 @@ struct is_nonscalar_prim_or_rev_kernel_expression /** @}*/ STAN_ADD_REQUIRE_UNARY(kernel_expression_lhs, is_kernel_expression_lhs, opencl_kernel_generator); +STAN_ADD_REQUIRE_UNARY(rev_kernel_expression, is_rev_kernel_expression, + opencl_kernel_generator); STAN_ADD_REQUIRE_UNARY(prim_or_rev_kernel_expression, is_prim_or_rev_kernel_expression, opencl_kernel_generator); From c76cfa6d91ecf285204b3b131d1cf3205aece465 Mon Sep 17 00:00:00 2001 From: Stan Jenkins Date: Mon, 21 Jun 2021 11:22:26 +0000 Subject: [PATCH 11/15] [Jenkins] auto-formatting by clang-format version 6.0.0-1ubuntu2~16.04.1 (tags/RELEASE_600/final) --- .../device_functions/atomic_add_double.hpp | 100 +++++++++--------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp index c17577edcca..8ae6977ba6b 100644 --- a/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp +++ b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp @@ -15,56 +15,56 @@ static const char *atomic_add_double_device_function "#define STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n" "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n" STRINGIFY( // \endcond - /** \ingroup opencl_kernels - * Atomically add to a double value. - * - * Code is taken from: - * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl - * - * @param a pointer to value to add to - * @param b value to add - */ - void atomic_add_double(__global double *val, double delta) { - union { - double f; - ulong i; - } old_val; - union { - double f; - ulong i; - } new_val; - do { - old_val.f = *val; - new_val.f = old_val.f + delta; - } while (atom_cmpxchg((volatile __global ulong *)val, old_val.i, - new_val.i) - != old_val.i); - } - /** \ingroup opencl_kernels - * Atomically add to a local double value. - * - * Code is taken from: - * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl - * - * @param a pointer to value to add to - * @param b value to add - */ - void local_atomic_add_double(__local double *val, double delta) { - union { - double f; - ulong i; - } old_val; - union { - double f; - ulong i; - } new_val; - do { - old_val.f = *val; - new_val.f = old_val.f + delta; - } while (atom_cmpxchg((volatile __local ulong *)val, old_val.i, - new_val.i) - != old_val.i); - } + /** \ingroup opencl_kernels + * Atomically add to a double value. + * + * Code is taken from: + * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl + * + * @param a pointer to value to add to + * @param b value to add + */ + void atomic_add_double(__global double *val, double delta) { + union { + double f; + ulong i; + } old_val; + union { + double f; + ulong i; + } new_val; + do { + old_val.f = *val; + new_val.f = old_val.f + delta; + } while (atom_cmpxchg((volatile __global ulong *)val, old_val.i, + new_val.i) + != old_val.i); + } + /** \ingroup opencl_kernels + * Atomically add to a local double value. + * + * Code is taken from: + * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl + * + * @param a pointer to value to add to + * @param b value to add + */ + void local_atomic_add_double(__local double *val, double delta) { + union { + double f; + ulong i; + } old_val; + union { + double f; + ulong i; + } new_val; + do { + old_val.f = *val; + new_val.f = old_val.f + delta; + } while (atom_cmpxchg((volatile __local ulong *)val, old_val.i, + new_val.i) + != old_val.i); + } // \cond ) "\n#endif\n"; // NOLINT // \endcond From a5fe75263881ca83948a1f5d7f25667f552598f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Mon, 21 Jun 2021 16:13:05 +0200 Subject: [PATCH 12/15] fixed fma multiplication --- stan/math/opencl/double_d.hpp | 90 ++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp index f2011c85bb3..99dcba23a1d 100644 --- a/stan/math/opencl/double_d.hpp +++ b/stan/math/opencl/double_d.hpp @@ -44,6 +44,47 @@ using std::isinf; using std::isnan; using std::ldexp; +inline double_d mul_d_d(double a, double b) { + double_d res; + // for some reason the std::fma only gives accurate result if compiling for + // instruction set that includes a fma instruction +#if defined(__FMA__) + res.high = a * b; + if (isfinite(res.high)) { + res.low = fma(a, b, -res.high); + } else { + res.low = 0; + } +#else + const double C = (2 << 26) + 1; + double p = a * C; + if (!isfinite(p)) { + res.high = p; + res.low = 0; + return res; + } + double hx = a - p + p; + double tx = a - hx; + p = b * C; + if (!isfinite(p)) { + res.high = p; + res.low = 0; + return res; + } + double hy = b - p + p; + double ty = b - hy; + p = hx * hy; + double q = hx * ty + tx * hy; + res.high = p + q; + if (isfinite(res.high)) { + res.low = p - res.high + q + tx * ty; + } else { + res.low = 0; + } +#endif + return res; +} + // \cond const char* double_d_src = STRINGIFY( // \endcond @@ -51,49 +92,22 @@ const char* double_d_src = STRINGIFY( double high; double low; } double_d; + + inline double_d mul_d_d(double a, double b) { + double_d res; + res.high = a * b; + if (isfinite(res.high)) { + res.low = fma(a, b, -res.high); + } else { + res.low = 0; + } + return res; + } // \cond ) STRINGIFY2( // \endcond - // inline double_d mul_d_d(double a, double b) { - // double_d res; - // volatile double temp = a * b; - // res.high = temp; - // res.low = fma(a, b, -temp); - // return res; - // } - - inline double_d mul_d_d(double a, double b) { - double_d res; - const double C = (2 << 26) + 1; - double p = a * C; - if (!isfinite(p)) { - res.high = p; - res.low = 0; - return res; - } - double hx = a - p + p; - double tx = a - hx; - p = b * C; - if (!isfinite(p)) { - res.high = p; - res.low = 0; - return res; - } - double hy = b - p + p; - double ty = b - hy; - p = hx * hy; - double q = hx * ty + tx * hy; - res.high = p + q; - if (isfinite(res.high)) { - res.low = p - res.high + q + tx * ty; - } else { - res.low = 0; - } - return res; - } - inline double_d neg(double_d a) { double_d res; res.high = -a.high; From f085f04cc13c1ccdbe4cd30082512db9c97205fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 22 Jun 2021 11:42:41 +0200 Subject: [PATCH 13/15] added indexing to opencl vari --- stan/math/opencl/rev/vari.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/stan/math/opencl/rev/vari.hpp b/stan/math/opencl/rev/vari.hpp index 5f5ef03165c..cd35d14c0f0 100644 --- a/stan/math/opencl/rev/vari.hpp +++ b/stan/math/opencl/rev/vari.hpp @@ -108,6 +108,27 @@ class vari_cl_base : public vari_base { std::move(adj_t)); } + /** + * Return indexed view into a matrix. + * + * Do not use with indices that reference any element of this more than once - + * that cans cause data races in rev operations on the result! + * + * @param row_index kg expression used for row index + * @param col_index kg expression used for column index + */ + template + auto index(const RowIndex& row_index, const ColIndex& col_index) { + RowIndex r1 = row_index; + RowIndex r2 = row_index; + ColIndex c1 = col_index; + ColIndex c2 = col_index; + auto&& val_t = stan::math::indexing(val_, std::move(r1), std::move(c1)); + auto&& adj_t = stan::math::indexing(adj_, std::move(r2), std::move(c2)); + return vari_view>(std::move(val_t), + std::move(adj_t)); + } + /** * Return the number of rows for this class's `val_` member */ From 64a557d34cb7a4032afc39fbdd0b503d1cbdb2b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 24 Jun 2021 07:33:10 +0200 Subject: [PATCH 14/15] fix val_op --- stan/math/rev/core/var.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stan/math/rev/core/var.hpp b/stan/math/rev/core/var.hpp index 26c78556d5b..0c7ade74951 100644 --- a/stan/math/rev/core/var.hpp +++ b/stan/math/rev/core/var.hpp @@ -401,7 +401,7 @@ class var_value< * @return The value of this variable. */ inline const auto& val() const { return vi_->val(); } - inline auto& val_op() { return vi_->val(); } + inline auto& val_op() { return vi_->val_op(); } /** * Return a reference to the derivative of the root expression with From 6d39b25198ae0faf3b9e81cb83c156b969e3a2da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Mon, 28 Jun 2021 14:11:25 +0200 Subject: [PATCH 15/15] fix --- benchmarks/benchmark.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 7d3aacb6581..f6e1158277e 100755 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -424,13 +424,16 @@ def benchmark( special_arg_values[function_name][n], numbers.Number ): value = special_arg_values[function_name][n] - if scalar == "double": + if not is_argument_autodiff or ( + not is_argument_scalar and (opencl == "base" or varmat == "base") + ): + arg_type_prim = cpp_arg_template.replace("SCALAR", "double"); setup += ( " {} {} = stan::test::{}<{}>({}, state.range(0));\n".format( - arg_type, + arg_type_prim, var_name, make_arg_function, - arg_type, + arg_type_prim, value, ) ) @@ -440,9 +443,15 @@ def benchmark( var_name + "_cl", var_name ) var_name += "_cl" + if is_argument_autodiff: + var_conversions += ( + " stan::math::var_value> {}({});\n".format( + var_name + "_var", var_name) + ) + var_name += "_var" elif varmat == "base" and arg_overload == "Rev": - setup += " auto {} = stan::math::to_var_value({});\n".format( - var_name + "_varmat", var_name + var_conversions += " stan::math::var_value<{}> {}({});\n".format( + arg_type_prim, var_name + "_varmat", var_name ) var_name += "_varmat" else: