diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index 7d3aacb6581..f6e1158277e 100755
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -424,13 +424,16 @@ def benchmark(
                 special_arg_values[function_name][n], numbers.Number
             ):
                 value = special_arg_values[function_name][n]
-            if scalar == "double":
+            if not is_argument_autodiff or (
+                not is_argument_scalar and (opencl == "base" or varmat == "base")
+            ):
+                arg_type_prim = cpp_arg_template.replace("SCALAR", "double")
                 setup += (
                     "  {} {} = stan::test::{}<{}>({}, state.range(0));\n".format(
-                        arg_type,
+                        arg_type_prim,
                         var_name,
                         make_arg_function,
-                        arg_type,
+                        arg_type_prim,
                         value,
                     )
                 )
@@ -440,9 +443,15 @@ def benchmark(
                     var_name + "_cl", var_name
                 )
                 var_name += "_cl"
+                if is_argument_autodiff:
+                    var_conversions += (
+                        "  stan::math::var_value<stan::math::matrix_cl<double>> {}({});\n".format(
+                            var_name + "_var", var_name)
+                    )
+                    var_name += "_var"
             elif varmat == "base" and arg_overload == "Rev":
-                setup += "  auto {} = stan::math::to_var_value({});\n".format(
-                    var_name + "_varmat", var_name
+                var_conversions += "  stan::math::var_value<{}> {}({});\n".format(
+                    arg_type_prim, var_name + "_varmat", var_name
                 )
                 var_name += "_varmat"
             else:
diff --git a/stan/math/opencl/copy.hpp b/stan/math/opencl/copy.hpp
index 574d1734b0a..b61c3f127c8 100644
--- a/stan/math/opencl/copy.hpp
+++ b/stan/math/opencl/copy.hpp
@@ -55,8 +55,7 @@ inline matrix_cl<scalar_type_t<T>> to_matrix_cl(T&& src) {
  * @param src source matrix on the OpenCL device
  * @return Eigen matrix with a copy of the data in the source matrix
  */
-template <typename T_ret, typename T,
-          require_eigen_st<std::is_arithmetic, T_ret>* = nullptr,
+template <typename T_ret, typename T, require_eigen_t<T_ret>* = nullptr,
           require_matrix_cl_t<T>* = nullptr,
           require_st_same<T_ret, T>* = nullptr>
 inline auto from_matrix_cl(const T& src) {
diff --git a/stan/math/opencl/double_d.hpp b/stan/math/opencl/double_d.hpp
new file mode 100644
index 00000000000..99dcba23a1d
--- /dev/null
+++ b/stan/math/opencl/double_d.hpp
@@ -0,0 +1,335 @@
+#ifndef STAN_MATH_OPENCL_DOUBLE_D_HPP
+#define STAN_MATH_OPENCL_DOUBLE_D_HPP
+
+#include
+
+#include
+#include
+#include
+#include
+
+/**
+ * Alternative stringify macro that also exposes the code for use in C++ host
+ * code.
+ */
+#define STRINGIFY2(A) \
+#A;                   \
+  A
+namespace stan {
+namespace math {
+namespace internal {
+
+/**
+ * Double double - a 128-bit floating-point number defined as an exact sum of
+ * two doubles. `low` should be less than 1 ulp of `high`.
+ */
+typedef struct double_d {
+  double high;
+  double low;
+
+  double_d() {}
+  double_d(double a) {
+    high = a;
+    low = 0;
+  }
+  double_d(double h, double l) {
+    high = h;
+    low = l;
+  }
+} double_d;
+
+using std::copysign;
+using std::fabs;
+using std::isfinite;
+using std::isinf;
+using std::isnan;
+using std::ldexp;
+
+inline double_d mul_d_d(double a, double b) {
+  double_d res;
+  // for some reason std::fma only gives an accurate result when compiling
+  // for an instruction set that includes an FMA instruction
+#if defined(__FMA__)
+  res.high = a * b;
+  if (isfinite(res.high)) {
+    res.low = fma(a, b, -res.high);
+  } else {
+    res.low = 0;
+  }
+#else
+  // Dekker's splitting constant: 2^27 + 1
+  const double C = (2 << 26) + 1;
+  double p = a * C;
+  if (!isfinite(p)) {
+    res.high = p;
+    res.low = 0;
+    return res;
+  }
+  double hx = a - p + p;
+  double tx = a - hx;
+  p = b * C;
+  if (!isfinite(p)) {
+    res.high = p;
+    res.low = 0;
+    return res;
+  }
+  double hy = b - p + p;
+  double ty = b - hy;
+  p = hx * hy;
+  double q = hx * ty + tx * hy;
+  res.high = p + q;
+  if (isfinite(res.high)) {
+    res.low = p - res.high + q + tx * ty;
+  } else {
+    res.low = 0;
+  }
+#endif
+  return res;
+}
+
+// \cond
+const char* double_d_src = STRINGIFY(
+    // \endcond
+    typedef struct {
+      double high;
+      double low;
+    } double_d;
+
+    inline double_d mul_d_d(double a, double b) {
+      double_d res;
+      res.high = a * b;
+      if (isfinite(res.high)) {
+        res.low = fma(a, b, -res.high);
+      } else {
+        res.low = 0;
+      }
+      return res;
+    }
+    // \cond
+    )
+    STRINGIFY2(
+        // \endcond
+
+    inline double_d neg(double_d a) {
+      double_d res;
+      res.high = -a.high;
+      res.low = -a.low;
+      return res;
+    }
+
+    inline double_d add_dd_dd(double_d a, double_d b) {
+      double_d res;
+      double high = a.high + b.high;
+      if (isfinite(high)) {
+        double low;
+        if (fabs(a.high) > fabs(b.high)) {
+          low = a.high - high + b.high;
+        } else {
+          low = b.high - high + a.high;
+        }
+        low += a.low + b.low;
+        res.high = high + low;
+        res.low = high - res.high + low;
+      } else {
+        res.high = high;
+        res.low = 0;
+      }
+      return res;
+    }
+
+    inline double_d add_dd_d(double_d a, double b) {
+      double_d res;
+      double high = a.high + b;
+      if (isfinite(high)) {
+        double low;
+        if (fabs(a.high) > fabs(b)) {
+          low = a.high - high + b;
+        } else {
+          low = b - high + a.high;
+        }
+        low += a.low;
+        res.high = high + low;
+        res.low = high - res.high + low;
+      } else {
+        res.high = high;
+        res.low = 0;
+      }
+      return res;
+    }
+
+    inline double_d sub_dd_dd(double_d a, double_d b) {
+      return add_dd_dd(a, neg(b));
+    }
+
+    inline double_d sub_dd_d(double_d a, double b) { return add_dd_d(a, -b); }
+
+    inline double_d sub_d_dd(double a, double_d b) {
+      return add_dd_d(neg(b), a);
+    }
+
+    inline double_d mul_dd_dd(double_d a, double_d b) {
+      double_d high = mul_d_d(a.high, b.high);
+      double_d res;
+      if (isfinite(high.high)) {
+        high.low += a.high * b.low + a.low * b.high;
+        res.high = high.high + high.low;
+        res.low = high.high - res.high + high.low;
+      } else {
+        high.low = 0;
+        return high;
+      }
+      return res;
+    }
+
+    inline double_d mul_dd_d(double_d a, double b) {
+      double_d high = mul_d_d(a.high, b);
+      double_d res;
+      if (isfinite(high.high)) {
+        high.low += a.low * b;
+        res.high = high.high + high.low;
+        res.low = high.high - res.high + high.low;
+        return res;
+      } else {
+        high.low = 0;
+        return high;
+      }
+    }
+
+    inline double_d div_dd_dd(double_d a, double_d b) {
+      double high = a.high / b.high;
+      double_d res;
+      if (isfinite(high)) {
+        double_d u = mul_d_d(high, b.high);
+        if (isfinite(u.high)) {
+          double low
+              = (a.high - u.high - u.low + a.low - high * b.low) / b.high;
+          res.high = high + low;
+          res.low = high - res.high + low;
+          return res;
+        }
+      }
+      res.high = high;
+      res.low = 0;
+      return res;
+    }
+
+    inline double_d div_dd_d(double_d a, double b) {
+      double high = a.high / b;
+      double_d res;
+      if (isfinite(high)) {
+        double_d u = mul_d_d(high, b);
+        double low = (a.high - u.high - u.low + a.low) / b;
+        res.high = high + low;
+        res.low = high - res.high + low;
+      } else {
+        res.high = high;
+        res.low = 0;
+      }
+      return res;
+    }
+
+    inline double_d div_d_dd(double a, double_d b) {
+      double high = a / b.high;
+      double_d res;
+      if (isfinite(high)) {
+        double_d u = mul_d_d(high, b.high);
+        double low = (a - u.high - u.low - high * b.low) / b.high;
+        res.high = high + low;
+        res.low = high - res.high + low;
+      } else {
+        res.high = high;
+        res.low = 0;
+      }
+      return res;
+    }
+
+    inline double copysign_d_dd(double a, double_d b) {
+      return copysign(a, b.high);
+    }
+
+    inline double_d abs_dd(double_d a) { return a.high > 0 ? a : neg(a); }
+
+    inline bool isnan_dd(double_d a) { return isnan(a.high); }
+
+    inline bool isinf_dd(double_d a) { return isinf(a.high); }
+
+    inline bool lt_dd_dd(double_d a, double_d b) {
+      return a.high < b.high || (a.high == b.high && a.low < b.low);
+    }
+
+    inline bool lt_d_dd(double a, double_d b) {
+      return lt_dd_dd((double_d){a, 0}, b);
+    }
+
+    inline bool lt_dd_d(double_d a, double b) {
+      return lt_dd_dd(a, (double_d){b, 0});
+    }
+
+    inline bool gt_dd_dd(double_d a, double_d b) {
+      return a.high > b.high || (a.high == b.high && a.low > b.low);
+    }
+
+    inline bool gt_d_dd(double a, double_d b) {
+      return gt_dd_dd((double_d){a, 0}, b);
+    }
+
+    inline bool gt_dd_d(double_d a, double b) {
+      return gt_dd_dd(a, (double_d){b, 0});
+    }
+
+    inline bool le_dd_dd(double_d a, double_d b) { return !gt_dd_dd(a, b); }
+
+    inline bool le_d_dd(double a, double_d b) { return !gt_d_dd(a, b); }
+
+    inline bool le_dd_d(double_d a, double b) { return !gt_dd_d(a, b); }
+
+    inline bool ge_dd_dd(double_d a, double_d b) { return !lt_dd_dd(a, b); }
+
+    inline bool ge_d_dd(double a, double_d b) { return !lt_d_dd(a, b); }
+
+    inline bool ge_dd_d(double_d a, double b) { return !lt_dd_d(a, b); }
+    // \cond
+);  // NOLINT(whitespace/parens)
+// \endcond
+
+inline double_d operator+(double_d a, double b) { return add_dd_d(a, b); }
+inline double_d operator+(double a, double_d b) { return add_dd_d(b, a); }
+inline double_d operator+(double_d a, double_d b) { return add_dd_dd(a, b); }
+
+inline double_d operator-(double a, double_d b) { return sub_d_dd(a, b); }
+inline double_d operator-(double_d a, double b) { return sub_dd_d(a, b); }
+inline double_d operator-(double_d a, double_d b) { return sub_dd_dd(a, b); }
+
+inline double_d operator*(double_d a, double b) { return mul_dd_d(a, b); }
+inline double_d operator*(double a, double_d b) { return mul_dd_d(b, a); }
+inline double_d operator*(double_d a, double_d b) { return mul_dd_dd(a, b); }
+
+inline double_d operator/(double_d a, double b) { return div_dd_d(a, b); }
+inline double_d operator/(double a, double_d b) { return div_d_dd(a, b); }
+inline double_d operator/(double_d a, double_d b) { return div_dd_dd(a, b); }
+
+inline double_d operator-(double_d a) { return neg(a); }
+
+inline bool operator<(double_d a, double_d b) { return lt_dd_dd(a, b); }
+inline bool operator<(double a, double_d b) { return lt_d_dd(a, b); }
+inline bool operator<(double_d a, double b) { return lt_dd_d(a, b); }
+inline bool operator>(double_d a, double_d b) { return gt_dd_dd(a, b); }
+inline bool operator>(double a, double_d b) { return gt_d_dd(a, b); }
+inline bool operator>(double_d a, double b) { return gt_dd_d(a, b); }
+inline bool operator<=(double_d a, double_d b) { return le_dd_dd(a, b); }
+inline bool operator<=(double a, double_d b) { return le_d_dd(a, b); }
+inline bool operator<=(double_d a, double b) { return le_dd_d(a, b); }
+inline bool operator>=(double_d a, double_d b) { return ge_dd_dd(a, b); }
+inline bool operator>=(double a, double_d b) { return ge_d_dd(a, b); }
+inline bool operator>=(double_d a, double b) { return ge_dd_d(a, b); }
+
+inline double_d fabs(double_d a) { return abs_dd(a); }
+inline bool isnan(double_d a) { return isnan_dd(a); }
+inline bool isinf(double_d a) { return isinf_dd(a); }
+inline double copysign(double a, double_d b) { return copysign_d_dd(a, b); }
+
+}  // namespace internal
+}  // namespace math
+}  // namespace stan
+
+#endif
diff --git a/stan/math/opencl/indexing_rev.hpp b/stan/math/opencl/indexing_rev.hpp
new file mode 100644
index 00000000000..94be70c2c68
--- /dev/null
+++ b/stan/math/opencl/indexing_rev.hpp
@@ -0,0 +1,55 @@
+#ifndef STAN_MATH_OPENCL_INDEXING_REV_HPP
+#define STAN_MATH_OPENCL_INDEXING_REV_HPP
+#ifdef STAN_OPENCL
+
+#include
+#include
+#include
+#include
+
+namespace stan {
+namespace math {
+
+/**
+ * Performs the reverse pass for the indexing operation on the OpenCL device.
+ * Depending on the size of the indexed matrix and the amount of local memory
+ * available on the device, it selects the best kernel to use for the
+ * operation.
+ *
+ * @param[in,out] adj adjoint of the argument to indexing
+ * @param idx indices
+ * @param res adjoint of the result of the indexing operation
+ */
+inline void indexing_rev(matrix_cl<double>& adj, const matrix_cl<int>& idx,
+                         const matrix_cl<double>& res) {
+  int local_mem_size
+      = opencl_context.device()[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+  int preferred_work_groups
+      = opencl_context.device()[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+  int local_size = 64;
+  int n_threads = preferred_work_groups * 16 * local_size;
+
+  try {
+    if (local_mem_size > sizeof(double) * adj.size() * local_size * 2) {
+      stan::math::opencl_kernels::indexing_rev_local_independent(
+          cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res,
+          cl::Local(sizeof(double) * adj.size() * local_size), res.size(),
+          adj.size());
+    } else if (local_mem_size > sizeof(double) * adj.size()) {
+      stan::math::opencl_kernels::indexing_rev_local_atomic(
+          cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res,
+          cl::Local(sizeof(double) * adj.size()), res.size(), adj.size());
+    } else {
+      stan::math::opencl_kernels::indexing_rev_global_atomic(
+          cl::NDRange(n_threads), cl::NDRange(local_size), adj, idx, res,
+          res.size());
+    }
+  } catch (cl::Error& e) {
+    check_opencl_error("indexing reverse pass", e);
+  }
+}
+
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
index c91e6a62a05..fd1455ec91f 100644
--- a/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
+++ b/stan/math/opencl/kernel_generator/matrix_cl_conversion.hpp
@@ -14,16 +14,14 @@ namespace math {
 template <typename T>
 template <typename Expr,
           require_all_kernel_expressions_and_none_scalar_t<Expr>*>
-matrix_cl<T, require_arithmetic_t<T>>::matrix_cl(const Expr& expresion)
-    : rows_(0), cols_(0) {
+matrix_cl<T>::matrix_cl(const Expr& expresion) : rows_(0), cols_(0) {
   results(*this) = expressions(expresion);
 }
 
 template <typename T>
 template <typename Expr,
           require_all_kernel_expressions_and_none_scalar_t<Expr>*>
-matrix_cl<T>& matrix_cl<T, require_arithmetic_t<T>>::operator=(
-    const Expr& expresion) {
+matrix_cl<T>& matrix_cl<T>::operator=(const Expr& expresion) {
   results(*this) = expressions(expresion);
   return *this;
 }
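(Not part of the diff: a quick host-side sketch of what the `double_d` arithmetic above provides. It assumes `stan/math/opencl/double_d.hpp` from this PR is on the include path; everything else is standard C++.)

```cpp
#include <stan/math/opencl/double_d.hpp>

#include <iostream>
#include <limits>

int main() {
  using stan::math::internal::double_d;
  const double half_eps = std::numeric_limits<double>::epsilon() / 2;

  // In plain double arithmetic (1.0 + half_eps) - 1.0 == 0.0, because
  // half_eps is below the spacing of representable doubles around 1.0.
  std::cout << (1.0 + half_eps) - 1.0 << "\n";  // prints 0

  // double_d keeps the sub-ulp part in `low` (via add_dd_d), so
  // subtracting 1.0 again recovers half_eps exactly.
  double_d a = double_d(1.0) + half_eps;
  std::cout << (a - 1.0).high << "\n";  // prints ~1.1102e-16
}
```

This is the property the `mumps`-style error-free transformations in `add_dd_dd`/`mul_d_d` are built on: each operation returns the rounded result in `high` and the exact rounding error in `low`, so roughly twice the precision of `double` survives across a chain of operations.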
diff --git a/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
new file mode 100644
index 00000000000..8ae6977ba6b
--- /dev/null
+++ b/stan/math/opencl/kernels/device_functions/atomic_add_double.hpp
@@ -0,0 +1,77 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_ATOMIC_ADD_DOUBLE_HPP
+#define STAN_MATH_OPENCL_KERNELS_DEVICE_ATOMIC_ADD_DOUBLE_HPP
+#ifdef STAN_OPENCL
+
+#include
+#include
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+static const char *atomic_add_double_device_function
+    = "\n"
+      "#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n"
+      "#define STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_ATOMIC_ADD_DOUBLE\n"
+      "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n" STRINGIFY(
+          // \endcond
+          /** \ingroup opencl_kernels
+           * Atomically add to a double value.
+           *
+           * Code is taken from:
+           * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
+           *
+           * @param val pointer to the value to add to
+           * @param delta value to add
+           */
+          void atomic_add_double(__global double *val, double delta) {
+            union {
+              double f;
+              ulong i;
+            } old_val;
+            union {
+              double f;
+              ulong i;
+            } new_val;
+            do {
+              old_val.f = *val;
+              new_val.f = old_val.f + delta;
+            } while (atom_cmpxchg((volatile __global ulong *)val, old_val.i,
+                                  new_val.i)
+                     != old_val.i);
+          }
+          /** \ingroup opencl_kernels
+           * Atomically add to a local double value.
+           *
+           * Code is taken from:
+           * https://stackoverflow.com/questions/31863587/atomic-operations-with-double-opencl
+           *
+           * @param val pointer to the local value to add to
+           * @param delta value to add
+           */
+          void local_atomic_add_double(__local double *val, double delta) {
+            union {
+              double f;
+              ulong i;
+            } old_val;
+            union {
+              double f;
+              ulong i;
+            } new_val;
+            do {
+              old_val.f = *val;
+              new_val.f = old_val.f + delta;
+            } while (atom_cmpxchg((volatile __local ulong *)val, old_val.i,
+                                  new_val.i)
+                     != old_val.i);
+          }
+          // \cond
+          ) "\n#endif\n";  // NOLINT
+// \endcond
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/stan/math/opencl/kernels/indexing_rev.hpp b/stan/math/opencl/kernels/indexing_rev.hpp
new file mode 100644
index 00000000000..f4a601396e7
--- /dev/null
+++ b/stan/math/opencl/kernels/indexing_rev.hpp
@@ -0,0 +1,167 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_INDEXING_REV_HPP
+#define STAN_MATH_OPENCL_KERNELS_INDEXING_REV_HPP
+#ifdef STAN_OPENCL
+
+#include
+#include
+#include
+#include
+#include
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+
+// \cond
+static const std::string indexing_rev_global_atomic_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     *
+     * Increments the adjoint of the indexing operation argument given the
+     * indices and the adjoints of the indexing result.
+     *
+     * This kernel uses global atomics and is the fastest for large indexed
+     * matrices.
+     *
+     * @param[in,out] adj adjoint to increment
+     * @param index indices of the result elements in the indexed matrix
+     * @param res adjoint of the result of indexing
+     * @param size number of elements of the indexing result
+     * @note Code is a const char* held in
+     * indexing_rev_global_atomic_kernel_code.
+     */
+    __kernel void indexing_rev(__global double* adj, const __global int* index,
+                               const __global double* res, int size) {
+      const int gid = get_global_id(0);
+      const int gsize = get_global_size(0);
+      for (int i = gid; i < size; i += gsize) {
+        atomic_add_double(adj + index[i], res[i]);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/indexing_rev.hpp indexing_rev() \endlink
+ */
+const kernel_cl<in_out_buffer, in_buffer, in_buffer, int>
+    indexing_rev_global_atomic("indexing_rev",
+                               {atomic_add_double_device_function,
+                                indexing_rev_global_atomic_kernel_code});
+
+// \cond
+static const std::string indexing_rev_local_atomic_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     *
+     * Increments the adjoint of the indexing operation argument given the
+     * indices and the adjoints of the indexing result.
+     *
+     * This kernel uses local atomics and is the fastest for medium-sized
+     * indexed matrices (it will not work for large ones).
+     *
+     * @param[in,out] adj adjoint to increment
+     * @param index indices of the result elements in the indexed matrix
+     * @param res adjoint of the result of indexing
+     * @param adj_loc local memory for a work-group-local copy of the adjoints
+     * @param index_size number of elements of the indexing result
+     * @param adj_size number of elements of the indexed matrix
+     * @note Code is a const char* held in
+     * indexing_rev_local_atomic_kernel_code.
+     */
+    __kernel void indexing_rev(__global double* adj, const __global int* index,
+                               const __global double* res,
+                               __local double* adj_loc, int index_size,
+                               int adj_size) {
+      const int gid = get_global_id(0);
+      const int lid = get_local_id(0);
+      const int gsize = get_global_size(0);
+      const int lsize = get_local_size(0);
+      for (int i = lid; i < adj_size; i += lsize) {
+        adj_loc[i] = 0;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = gid; i < index_size; i += gsize) {
+        local_atomic_add_double(adj_loc + index[i], res[i]);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = lid; i < adj_size; i += lsize) {
+        atomic_add_double(adj + i, adj_loc[i]);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/indexing_rev.hpp indexing_rev() \endlink
+ */
+const kernel_cl<in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int,
+                int>
+    indexing_rev_local_atomic("indexing_rev",
+                              {atomic_add_double_device_function,
+                               indexing_rev_local_atomic_kernel_code});
+
+// \cond
+static const std::string indexing_rev_local_independent_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     *
+     * Increments the adjoint of the indexing operation argument given the
+     * indices and the adjoints of the indexing result.
+     *
+     * This kernel makes each thread in a work group build its own copy of
+     * the adjoints before combining them. It is the fastest for (and only
+     * works with) small indexed matrices.
+     *
+     * @param[in,out] adj adjoint to increment
+     * @param index indices of the result elements in the indexed matrix
+     * @param res adjoint of the result of indexing
+     * @param adj_loc local memory for per-thread copies of the adjoints
+     * @param index_size number of elements of the indexing result
+     * @param adj_size number of elements of the indexed matrix
+     * @note Code is a const char* held in
+     * indexing_rev_local_independent_kernel_code.
+     */
+    __kernel void indexing_rev(__global double* adj, const __global int* index,
+                               const __global double* res,
+                               __local double* adj_loc, int index_size,
+                               int adj_size) {
+      const int gid = get_global_id(0);
+      const int lid = get_local_id(0);
+      const int gsize = get_global_size(0);
+      const int lsize = get_local_size(0);
+      for (int i = lid; i < adj_size * lsize; i += lsize) {
+        adj_loc[i] = 0;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = gid; i < index_size; i += gsize) {
+        adj_loc[index[i] + lid * adj_size] += res[i];
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = lid; i < adj_size; i += lsize) {
+        double p = adj_loc[i + adj_size];
+        for (int j = 2; j < lsize; j++) {
+          p += adj_loc[i + j * adj_size];
+        }
+        adj_loc[i] += p;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int i = lid; i < adj_size; i += lsize) {
+        atomic_add_double(adj + i, adj_loc[i]);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/indexing_rev.hpp indexing_rev() \endlink
+ */
+const kernel_cl<in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int,
+                int>
+    indexing_rev_local_independent(
+        "indexing_rev", {atomic_add_double_device_function,
+                         indexing_rev_local_independent_kernel_code});
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp
index e4949d69761..b12cd6bd4f4 100644
--- a/stan/math/opencl/matrix_cl.hpp
+++ b/stan/math/opencl/matrix_cl.hpp
@@ -31,7 +31,7 @@ namespace math {
  * @{
  */
 
-template <typename T, typename = void>
+template <typename T>
 class matrix_cl;
 
 /**
@@ -39,7 +39,7 @@ class matrix_cl;
  * @tparam T an arithmetic type for the type stored in the OpenCL buffer.
  */
 template <typename T>
-class matrix_cl<T, require_arithmetic_t<T>> : public matrix_cl_base {
+class matrix_cl : public matrix_cl_base {
  private:
   cl::Buffer buffer_cl_;  // Holds the allocated memory on the device
   int rows_{0};           // Number of rows.
@@ -624,13 +624,6 @@ class matrix_cl<T, require_arithmetic_t<T>> : public matrix_cl_base {
     delete static_cast(container);
   }
 };
-
-template <typename T>
-using matrix_cl_prim = matrix_cl<T, require_arithmetic_t<T>>;
-
-template <typename T>
-using matrix_cl_fp = matrix_cl<T, require_floating_point_t<T>>;
-
 /** @}*/
 
 }  // namespace math
diff --git a/stan/math/opencl/rev/vari.hpp b/stan/math/opencl/rev/vari.hpp
index 5f5ef03165c..cd35d14c0f0 100644
--- a/stan/math/opencl/rev/vari.hpp
+++ b/stan/math/opencl/rev/vari.hpp
@@ -108,6 +108,27 @@ class vari_cl_base : public vari_base {
                                           std::move(adj_t));
   }
 
+  /**
+   * Return an indexed view into a matrix.
+   *
+   * Do not use with indices that reference any element of this matrix more
+   * than once - that can cause data races in rev operations on the result!
+   *
+   * @param row_index kernel generator expression used for the row index
+   * @param col_index kernel generator expression used for the column index
+   */
+  template <typename RowIndex, typename ColIndex>
+  auto index(const RowIndex& row_index, const ColIndex& col_index) {
+    RowIndex r1 = row_index;
+    RowIndex r2 = row_index;
+    ColIndex c1 = col_index;
+    ColIndex c2 = col_index;
+    auto&& val_t = stan::math::indexing(val_, std::move(r1), std::move(c1));
+    auto&& adj_t = stan::math::indexing(adj_, std::move(r2), std::move(c2));
+    return vari_view<std::decay_t<decltype(val_t)>>(std::move(val_t),
+                                                    std::move(adj_t));
+  }
+
   /**
    * Return the number of rows for this class's `val_` member
    */
diff --git a/stan/math/opencl/zeros_strict_tri.hpp b/stan/math/opencl/zeros_strict_tri.hpp
index 66ff85c64f4..30770a7aef9 100644
--- a/stan/math/opencl/zeros_strict_tri.hpp
+++ b/stan/math/opencl/zeros_strict_tri.hpp
@@ -29,7 +29,7 @@ namespace math {
  */
 template <typename T>
 template <matrix_cl_view matrix_view>
-inline void matrix_cl<T, require_arithmetic_t<T>>::zeros_strict_tri() try {
+inline void matrix_cl<T>::zeros_strict_tri() try {
   if (matrix_view == matrix_cl_view::Entire) {
     invalid_argument(
         "zeros_strict_tri", "matrix_view",
diff --git a/stan/math/prim/meta/is_kernel_expression.hpp b/stan/math/prim/meta/is_kernel_expression.hpp
index 8dde7061598..1063302f41a 100644
--- a/stan/math/prim/meta/is_kernel_expression.hpp
+++ b/stan/math/prim/meta/is_kernel_expression.hpp
@@ -76,15 +76,20 @@ struct is_kernel_expression_lhs
 template <typename T>
 struct is_kernel_expression_lhs<matrix_cl<T>> : std::true_type {};
 
+/**
+ * Determines whether a type is a var containing a kernel generator expression.
+ */
+template <typename T>
+struct is_rev_kernel_expression
+    : math::conjunction<is_var<T>, is_kernel_expression<value_type_t<T>>> {};
+
 /**
  * Determines whether a type is either a kernel generator
 * expression or a var containing a kernel generator expression.
 */
 template <typename T>
 struct is_prim_or_rev_kernel_expression
-    : math::disjunction<
-          is_kernel_expression<T>,
-          math::conjunction<is_var<T>, is_kernel_expression<value_type_t<T>>>> {
+    : math::disjunction<is_kernel_expression<T>, is_rev_kernel_expression<T>> {
 };
 
 /**
@@ -101,6 +106,8 @@ struct is_nonscalar_prim_or_rev_kernel_expression
 /** @}*/
 STAN_ADD_REQUIRE_UNARY(kernel_expression_lhs, is_kernel_expression_lhs,
                        opencl_kernel_generator);
+STAN_ADD_REQUIRE_UNARY(rev_kernel_expression, is_rev_kernel_expression,
+                       opencl_kernel_generator);
 STAN_ADD_REQUIRE_UNARY(prim_or_rev_kernel_expression,
                        is_prim_or_rev_kernel_expression,
                        opencl_kernel_generator);
diff --git a/stan/math/rev/core/var.hpp b/stan/math/rev/core/var.hpp
index 26c78556d5b..0c7ade74951 100644
--- a/stan/math/rev/core/var.hpp
+++ b/stan/math/rev/core/var.hpp
@@ -401,7 +401,7 @@ class var_value<
    * @return The value of this variable.
    */
   inline const auto& val() const { return vi_->val(); }
-  inline auto& val_op() { return vi_->val(); }
+  inline auto& val_op() { return vi_->val_op(); }
 
   /**
    * Return a reference to the derivative of the root expression with
diff --git a/test/unit/math/opencl/double_d_test.cpp b/test/unit/math/opencl/double_d_test.cpp
new file mode 100644
index 00000000000..ffbad5e7ff2
--- /dev/null
+++ b/test/unit/math/opencl/double_d_test.cpp
@@ -0,0 +1,177 @@
+#include
+#include
+#include
+#include
+
+#define EXPECT_NORMALIZED(a)                                                 \
+  EXPECT_LT(std::abs(a.low),                                                 \
+            std::abs(a.high) * std::numeric_limits<double>::epsilon());
+
+TEST(double_d, add_dd_dd_test) {
+  using stan::math::internal::add_dd_dd;
+  using stan::math::internal::double_d;
+  double eps = std::numeric_limits<double>::epsilon();
+  double h_eps = eps * 0.5;
+  double_d a{1.0, h_eps};
+  double_d c{h_eps, 0.0};
+  double_d d{-1.0, -h_eps + eps * h_eps};
+
+  // simple
+  double_d res = add_dd_dd(a, a);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, 2.0);
+  EXPECT_EQ(res.low, eps);
+
+  // carry
+  res = add_dd_dd(a, c);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, 1.0 + eps);
+  EXPECT_EQ(res.low, 0.0);
+
+  // cancellation
+  res = add_dd_dd(a, d);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, eps * h_eps);
+  EXPECT_EQ(res.low, 0.0);
+}
+
+TEST(double_d, mul_dd_dd_test) {
+  using stan::math::internal::double_d;
+  using stan::math::internal::mul_dd_dd;
+  double eps = std::numeric_limits<double>::epsilon();
+  double h_eps = eps * 0.5;
+  double_d a{1.0, h_eps * 0.001};
+  double_d b{h_eps, h_eps * h_eps * h_eps * h_eps};
+  double_d c{1.0, h_eps};
+  double_d d{1.0, -h_eps};
+
+  // simple
+  double_d res = mul_dd_dd(a, b);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, h_eps);
+  EXPECT_EQ(res.low, h_eps * h_eps * 0.001);
+
+  // carry
+  res = mul_dd_dd(c, c);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, 1.0 + eps);
+  EXPECT_EQ(res.low, 0.0);
+
+  // cancellation
+  res = mul_dd_dd(c, d);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, 1.0);
+  EXPECT_EQ(res.low, 0.0);
+}
+
+TEST(double_d, div_dd_dd_test) {
+  using stan::math::internal::div_dd_dd;
+  using stan::math::internal::double_d;
+  double eps = std::numeric_limits<double>::epsilon();
+  double h_eps = eps * 0.5;
+  double_d a{1.0, h_eps};
+  double_d b{1.0, h_eps * (1 - eps)};
+  double_d c{0.0, 0.0};
+  double_d d{1.0, -h_eps};
+
+  // simple
+  double_d res = div_dd_dd(a, a);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, 1.0);
+  EXPECT_EQ(res.low, 0.0);
+
+  res = div_dd_dd(a, b);
+  EXPECT_NORMALIZED(res);
+  EXPECT_EQ(res.high, 1.0);
+  EXPECT_EQ(res.low, h_eps * eps);
+
+  // div by zero
+  res = div_dd_dd(a, c);
+  EXPECT_EQ(res.high, std::numeric_limits<double>::infinity());
+  EXPECT_EQ(res.low, 0);
+}
+
+// This is the most thorough test we have, but it relies on the nonstandard
+// GCC quadmath extension, so it is not run by default.
+#ifdef GCC_QUADMATH_TEST
+#include
+
+#define EXPECT_DD_F128_EQ(dd, f128) \
+  tmp = (dd);                       \
+  EXPECT_NEAR((__float128)tmp.high + tmp.low, (f128), 1e-30 * 1e30)
+
+TEST(double_d, all) {
+  using stan::math::internal::double_d;
+  for (int i = 0; i < 10000000; i++) {
+    double_d tmp;
+    double high = Eigen::MatrixXd::Random(1, 1).coeff(0, 0) * 1e30;
+    double low = Eigen::MatrixXd::Random(1, 1).coeff(0, 0) * 1e-20 * 1e30;
+    double_d dd = high;
+    dd.low = low;
+    __float128 f128 = high;
+    f128 += low;
+    EXPECT_DD_F128_EQ(dd, f128);
+
+    double high2 = Eigen::MatrixXd::Random(1, 1).coeff(0, 0);
+    double low2 = Eigen::MatrixXd::Random(1, 1).coeff(0, 0) * 1e-20;
+    double_d dd2 = high2;
+    dd2.low = low2;
+    __float128 f1282 = high2;
+    f1282 += low2;
+    EXPECT_DD_F128_EQ(dd2, f1282);
+
+    EXPECT_DD_F128_EQ(dd + dd2, f128 + f1282);
+    EXPECT_DD_F128_EQ(dd - dd2, f128 - f1282);
+    EXPECT_DD_F128_EQ(dd * dd2, f128 * f1282);
+    EXPECT_DD_F128_EQ(dd / dd2, f128 / f1282);
+
+    __float128 f1283 = high;
+    __float128 f1284 = high2;
+    EXPECT_DD_F128_EQ(stan::math::internal::mul_d_d(high, high2),
+                      f1283 * f1284);
+  }
+}
+#endif
+
+#ifdef STAN_OPENCL
+
+#include
+static const std::string double_d_test_kernel_code
+    = STRINGIFY(__kernel void double_d_test_kernel(__global double_d *C,
+                                                   const __global double_d *A,
+                                                   const __global double *B) {
+        const int i = get_global_id(0);
+        C[i] = mul_dd_d(A[i], B[i]);
+      });
+
+const stan::math::opencl_kernels::kernel_cl<
+    stan::math::opencl_kernels::out_buffer,
+    stan::math::opencl_kernels::in_buffer,
+    stan::math::opencl_kernels::in_buffer>
+    double_d_test_kernel("double_d_test_kernel",
+                         {stan::math::internal::double_d_src,
+                          double_d_test_kernel_code});
+
+TEST(double_d, opencl) {
+  using stan::math::internal::double_d;
+  using VectorXdd = Eigen::Matrix<double_d, Eigen::Dynamic, 1>;
+  int n = 10;
+  VectorXdd a(n);
+  Eigen::VectorXd b = Eigen::VectorXd::Random(n);
+  for (int i = 0; i < n; i++) {
+    a[i].high = Eigen::VectorXd::Random(1)[0];
+    a[i].low = Eigen::VectorXd::Random(1)[0] * 1e-17;
+  }
+  stan::math::matrix_cl<double_d> a_cl(a);
+  stan::math::matrix_cl<double> b_cl(b);
+  stan::math::matrix_cl<double_d> c_cl(n, 1);
+  double_d_test_kernel(cl::NDRange(n), c_cl, a_cl, b_cl);
+  VectorXdd c = stan::math::from_matrix_cl(c_cl);
+  for (int i = 0; i < n; i++) {
+    double_d correct = a[i] * b[i];
+    EXPECT_EQ(c[i].high, correct.high);
+    EXPECT_NEAR(c[i].low, correct.low, 1e-30);
+  }
+}
+
+#endif
diff --git a/test/unit/math/opencl/indexing_rev_test.cpp b/test/unit/math/opencl/indexing_rev_test.cpp
new file mode 100644
index 00000000000..d7f1c40400f
--- /dev/null
+++ b/test/unit/math/opencl/indexing_rev_test.cpp
@@ -0,0 +1,50 @@
+#ifdef STAN_OPENCL
+#include
+#include
+#include
+#include
+#include
+#include
+
+TEST(indexing_rev, indexing_rev_small) {
+  Eigen::MatrixXd res(3, 3);
+  res << 1, 2, 3, 4, 5, 6, 7, 8, 9;
+  Eigen::MatrixXi idx(3, 3);
+  idx << 1, 2, 1, 0, 1, 2, 1, 1, 1;
+  Eigen::MatrixXd adj = Eigen::MatrixXd::Zero(3, 1);
+  Eigen::MatrixXd correct(3, 1);
+  correct << 4, 33, 8;
+  stan::math::matrix_cl<double> res_cl(res);
+  stan::math::matrix_cl<int> idx_cl(idx);
+  stan::math::matrix_cl<double> adj_cl(adj);
+
+  stan::math::indexing_rev(adj_cl, idx_cl, res_cl);
+  EXPECT_NEAR_REL(stan::math::from_matrix_cl(adj_cl), correct);
+}
+
+TEST(indexing_rev, indexing_rev_large) {
+  int N = 377;
+
+  // different sizes of adj ensure all three kernels are tested, regardless of
+  // the OpenCL device
+  for (int M = 1; M < 1e6; M *= 2) {
+    Eigen::MatrixXd res = Eigen::MatrixXd::Random(N, N);
+    Eigen::MatrixXi idx(N, N);
+    for (int i = 0; i < N * N; i++) {
+      idx(i) = std::abs(Eigen::MatrixXi::Random(1, 1)(0)) % M;
+    }
+    Eigen::MatrixXd adj = Eigen::MatrixXd::Zero(M, 1);
+    Eigen::MatrixXd correct = adj;
+    for (int i = 0; i < N * N; i++) {
+      correct(idx(i)) += res(i);
+    }
+    stan::math::matrix_cl<double> res_cl(res);
+    stan::math::matrix_cl<int> idx_cl(idx);
+    stan::math::matrix_cl<double> adj_cl(adj);
+
+    stan::math::indexing_rev(adj_cl, idx_cl, res_cl);
+    EXPECT_NEAR_REL(stan::math::from_matrix_cl(adj_cl), correct);
+  }
+}
+
+#endif
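(Not part of the diff: for reference, all three kernels dispatched by `indexing_rev` implement the same scatter-add that the large test above computes on the host. A single-threaded sketch of those semantics; `indexing_rev_reference` is a hypothetical name used only for illustration.)

```cpp
#include <Eigen/Dense>

// Reference semantics of indexing_rev: for B = A(idx), the adjoint of A
// accumulates the adjoint of every element of B that was read from the
// same position of A. The three OpenCL kernels differ only in how they
// parallelize this scatter-add: global atomics for large adj, a
// work-group-local copy with local atomics for medium adj, or fully
// private per-thread copies for small adj.
void indexing_rev_reference(Eigen::MatrixXd& adj, const Eigen::MatrixXi& idx,
                            const Eigen::MatrixXd& res) {
  for (int i = 0; i < idx.size(); ++i) {
    adj(idx(i)) += res(i);  // linear (column-major) indexing, as in the test
  }
}
```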