From 8099c90fb61dd96326be00be969fe5ceb520504c Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Mon, 30 Dec 2019 17:27:05 +0100 Subject: [PATCH 001/189] prim version --- stan/math/prim/scal.hpp | 2 + .../prim/scal/functor/parallel_reduce_sum.hpp | 79 +++++++++++++++++++ .../functor/parallel_v3_reduce_sum_test.cpp | 69 ++++++++++++++++ 3 files changed, 150 insertions(+) create mode 100644 stan/math/prim/scal/functor/parallel_reduce_sum.hpp create mode 100644 test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp diff --git a/stan/math/prim/scal.hpp b/stan/math/prim/scal.hpp index 1784f714688..ea4700b4722 100644 --- a/stan/math/prim/scal.hpp +++ b/stan/math/prim/scal.hpp @@ -131,6 +131,8 @@ #include #include +#include + #include #include #include diff --git a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp new file mode 100644 index 00000000000..4a00db87de7 --- /dev/null +++ b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp @@ -0,0 +1,79 @@ +#ifndef STAN_MATH_PRIM_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP +#define STAN_MATH_PRIM_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include + +namespace stan { +namespace math { + +namespace internal { + +// base definition => compile error +template +struct parallel_reduce_sum_impl {}; + +template +struct parallel_reduce_sum_impl { + struct recursive_reducer { + InputIt first_; + const T& init_; + T sum_; + typedef typename std::iterator_traits::value_type elem_t; + + recursive_reducer(InputIt first, const T& init) + : first_(first), init_(init), sum_(init) {} + + recursive_reducer(recursive_reducer& other, tbb::split) + : first_(other.first_), init_(other.init_), sum_(init_) {} + + void operator()(const tbb::blocked_range& r) { + if (r.empty()) + return; + + auto start = first_; + std::advance(start, r.begin()); + auto end = first_; + std::advance(end, r.end()); + + std::vector sub_slice(start, end); + + sum_ += ReduceFunction()(r.begin(), r.end() - 1, sub_slice); + } + void join(const recursive_reducer& child) { sum_ += child.sum_; } + }; + + T operator()(InputIt first, InputIt last, T init, + std::size_t grainsize) const { + const std::size_t num_jobs = std::distance(first, last); + recursive_reducer worker(first, init); + tbb::parallel_reduce( + tbb::blocked_range(0, num_jobs, grainsize), worker); + return std::move(worker.sum_); + } +}; +} // namespace internal + +template +constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, + std::size_t grainsize = 1) { + typedef T return_base_t; + return internal::parallel_reduce_sum_impl()(first, last, init, + grainsize); +} + +} // namespace math +} // namespace stan + +#endif diff --git a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp new file mode 100644 index 00000000000..d7e1e3f4eca --- /dev/null +++ b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include + +// reduce functor which is the BinaryFunction +// here we use iterators which represent integer indices +template +struct count_lpdf { + count_lpdf() {} + + // does the reduction in the sub-slice start to end + inline T operator()(std::size_t start, std::size_t end, + std::vector sub_slice) const { + T lambda = 10.0; + return stan::math::poisson_lpmf(sub_slice, lambda); + } +}; + +TEST(parallel_v3_reduce_sum, gradient) 
{ + double lambda_d = 10.0; + const std::size_t elems = 10000; + const std::size_t num_iter = 1000; + std::vector data(elems); + + for (std::size_t i = 0; i != elems; ++i) + data[i] = i; + + typedef boost::counting_iterator count_iter; + + double poisson_lpdf = stan::math::parallel_reduce_sum>( + data.begin(), data.end(), 0.0); + + double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); + + EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref); +} + +/* +TEST(parallel_v3_reduce_sum, gradient) { + double lambda_d = 10.0; + const std::size_t elems = 10000; + const std::size_t num_iter = 1000; + std::vector data(elems); + + for (std::size_t i = 0; i != elems; ++i) + data[i] = i; + + typedef boost::counting_iterator count_iter; + using stan::math::var; + + count_lpdf reduce_op; + + var poisson_lpdf = stan::math::parallel_reduce_sum( + count_iter(0), count_iter(elems), var(0.0), reduce_op); + + var lambda_ref = lambda_d; + var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref); + + EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref)); + + stan::math::grad(poisson_lpdf.vi_); + stan::math::grad(poisson_lpdf_ref.vi_); + + EXPECT_FLOAT_EQ(poisson_lpdf.adj(), poisson_lpdf_ref.adj()); +} +*/ From 7027c12b6b888948168c46039aad029ce67c165b Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Mon, 30 Dec 2019 19:35:05 +0100 Subject: [PATCH 002/189] make nested stuff work --- .../prim/scal/functor/parallel_reduce_sum.hpp | 33 +++--- stan/math/rev/scal.hpp | 2 + .../rev/scal/functor/parallel_reduce_sum.hpp | 107 ++++++++++++++++++ .../functor/parallel_v3_reduce_sum_test.cpp | 39 +++++-- 4 files changed, 156 insertions(+), 25 deletions(-) create mode 100644 stan/math/rev/scal/functor/parallel_reduce_sum.hpp diff --git a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp index 4a00db87de7..50bca41c251 100644 --- a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp @@ -20,22 +20,27 @@ namespace math { namespace internal { // base definition => compile error -template +template struct parallel_reduce_sum_impl {}; -template -struct parallel_reduce_sum_impl { +template +struct parallel_reduce_sum_impl { struct recursive_reducer { InputIt first_; const T& init_; + Arg1& arg1_; T sum_; typedef typename std::iterator_traits::value_type elem_t; - recursive_reducer(InputIt first, const T& init) - : first_(first), init_(init), sum_(init) {} + recursive_reducer(InputIt first, const T& init, Arg1& arg1) + : first_(first), init_(init), arg1_(arg1), sum_(init) {} recursive_reducer(recursive_reducer& other, tbb::split) - : first_(other.first_), init_(other.init_), sum_(init_) {} + : first_(other.first_), + init_(other.init_), + arg1_(other.arg1_), + sum_(init_) {} void operator()(const tbb::blocked_range& r) { if (r.empty()) @@ -48,15 +53,15 @@ struct parallel_reduce_sum_impl { std::vector sub_slice(start, end); - sum_ += ReduceFunction()(r.begin(), r.end() - 1, sub_slice); + sum_ += ReduceFunction()(r.begin(), r.end() - 1, sub_slice, arg1_); } void join(const recursive_reducer& child) { sum_ += child.sum_; } }; - T operator()(InputIt first, InputIt last, T init, - std::size_t grainsize) const { + T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, + Arg1& arg1) const { const std::size_t num_jobs = std::distance(first, last); - recursive_reducer worker(first, init); + recursive_reducer worker(first, init, arg1); tbb::parallel_reduce( tbb::blocked_range(0, 
num_jobs, grainsize), worker); return std::move(worker.sum_); @@ -64,13 +69,13 @@ struct parallel_reduce_sum_impl { }; } // namespace internal -template +template constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, - std::size_t grainsize = 1) { + std::size_t grainsize, Arg1& arg1) { typedef T return_base_t; - return internal::parallel_reduce_sum_impl()(first, last, init, - grainsize); + grainsize, arg1); } } // namespace math diff --git a/stan/math/rev/scal.hpp b/stan/math/rev/scal.hpp index ff4a91c1f79..2ac1387a702 100644 --- a/stan/math/rev/scal.hpp +++ b/stan/math/rev/scal.hpp @@ -102,4 +102,6 @@ #include #include +#include + #endif diff --git a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp new file mode 100644 index 00000000000..7bf2a7b3db9 --- /dev/null +++ b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp @@ -0,0 +1,107 @@ +#ifndef STAN_MATH_REV_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP +#define STAN_MATH_REV_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include + +namespace stan { +namespace math { + +namespace internal { + +template +struct parallel_reduce_sum_impl { + using ops_partials_t = operands_and_partials; + + struct recursive_reducer { + InputIt first_; + const T& init_; + Arg1& arg1_; + std::vector terms_adjs_; + std::vector terms_vals_; + typedef typename std::iterator_traits::value_type elem_t; + + recursive_reducer(InputIt first, const T& init, Arg1& arg1) + : first_(first), + init_(init), + arg1_(arg1), + terms_adjs_(1, 0.0), + terms_vals_(1, init_.val()) {} + + recursive_reducer(recursive_reducer& other, tbb::split) + : first_(other.first_), + init_(other.init_), + arg1_(other.arg1_), + terms_adjs_(1, 0.0), + terms_vals_(1, init_.val()) {} + // terms_adjs_(other.terms_adjs_), terms_vals_(other.terms_vals_) {} + + void operator()(const tbb::blocked_range& r) { + if (r.empty()) + return; + + auto start = first_; + std::advance(start, r.begin()); + auto end = first_; + std::advance(end, r.end()); + + try { + start_nested(); + + // elem_t should better be a non-var!!! 
Otherwise we interfere + // with the outer AD tree as we mess with the adjoints (and + // right now we discard that it is a var if it would be) + std::vector sub_slice(start, end); + + // create a copy of arg1 which is not tied to the out AD tree + Arg1 arg1(var(new vari(arg1_.val(), false))); + + T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, sub_slice, arg1); + + sub_sum_v.grad(); + + terms_vals_.push_back(sub_sum_v.val()); + terms_adjs_.push_back(arg1.adj()); + + } catch (const std::exception& e) { + recover_memory_nested(); + throw; + } + recover_memory_nested(); + } + void join(const recursive_reducer& child) { + terms_adjs_.insert(terms_adjs_.end(), child.terms_adjs_.begin(), + child.terms_adjs_.end()); + terms_vals_.insert(terms_vals_.end(), child.terms_vals_.begin(), + child.terms_vals_.end()); + } + }; + + T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, + Arg1& arg1) const { + const std::size_t num_jobs = std::distance(first, last); + recursive_reducer worker(first, init, arg1); + tbb::parallel_reduce( + tbb::blocked_range(0, num_jobs, grainsize), worker); + std::vector terms(worker.terms_adjs_.size()); + ops_partials_t ops_sum(arg1); + ops_sum.edge1_.partials_[0] = sum(worker.terms_adjs_); + return ops_sum.build(sum(worker.terms_vals_)); + } +}; +} // namespace internal + +} // namespace math +} // namespace stan + +#endif diff --git a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp index d7e1e3f4eca..6e9824cf989 100644 --- a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp +++ b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -13,13 +14,14 @@ struct count_lpdf { // does the reduction in the sub-slice start to end inline T operator()(std::size_t start, std::size_t end, - std::vector sub_slice) const { - T lambda = 10.0; + std::vector sub_slice, T& lambda) const { return stan::math::poisson_lpmf(sub_slice, lambda); } }; -TEST(parallel_v3_reduce_sum, gradient) { +TEST(parallel_v3_reduce_sum, value) { + stan::math::init_threadpool_tbb(); + double lambda_d = 10.0; const std::size_t elems = 10000; const std::size_t num_iter = 1000; @@ -31,15 +33,16 @@ TEST(parallel_v3_reduce_sum, gradient) { typedef boost::counting_iterator count_iter; double poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), 0.0); + data.begin(), data.end(), 0.0, 5, lambda_d); double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref); } -/* TEST(parallel_v3_reduce_sum, gradient) { + stan::math::init_threadpool_tbb(); + double lambda_d = 10.0; const std::size_t elems = 10000; const std::size_t num_iter = 1000; @@ -51,19 +54,33 @@ TEST(parallel_v3_reduce_sum, gradient) { typedef boost::counting_iterator count_iter; using stan::math::var; - count_lpdf reduce_op; + var lambda_v = lambda_d; - var poisson_lpdf = stan::math::parallel_reduce_sum( - count_iter(0), count_iter(elems), var(0.0), reduce_op); + var poisson_lpdf = stan::math::parallel_reduce_sum>( + data.begin(), data.end(), var(0.0), 5, lambda_v); var lambda_ref = lambda_d; var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref); EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref)); - stan::math::grad(poisson_lpdf.vi_); stan::math::grad(poisson_lpdf_ref.vi_); + const double lambda_ref_adj = lambda_ref.adj(); + + stan::math::set_zero_all_adjoints(); + 
stan::math::grad(poisson_lpdf.vi_); + const double lambda_adj = lambda_v.adj(); + + EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj); + + std::cout << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() + << std::endl; + ; + std::cout << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl; + ; - EXPECT_FLOAT_EQ(poisson_lpdf.adj(), poisson_lpdf_ref.adj()); + std::cout << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl; + ; + std::cout << "gradient wrt to lambda: " << lambda_adj << std::endl; + ; } -*/ From 302cd71850cb29f1de585e0a53a4f5cb100f0e42 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Tue, 31 Dec 2019 10:29:54 +0100 Subject: [PATCH 003/189] extend signature and test nested parallel AD --- .../prim/scal/functor/parallel_reduce_sum.hpp | 30 +++-- .../rev/scal/functor/parallel_reduce_sum.hpp | 61 +++++---- .../functor/parallel_v3_reduce_sum_test.cpp | 119 +++++++++++++++++- 3 files changed, 170 insertions(+), 40 deletions(-) diff --git a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp index 50bca41c251..6b613132012 100644 --- a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp @@ -28,19 +28,20 @@ template struct parallel_reduce_sum_impl { struct recursive_reducer { InputIt first_; - const T& init_; - Arg1& arg1_; + std::vector& varg1_; + const std::vector& idata1_; T sum_; typedef typename std::iterator_traits::value_type elem_t; - recursive_reducer(InputIt first, const T& init, Arg1& arg1) - : first_(first), init_(init), arg1_(arg1), sum_(init) {} + recursive_reducer(InputIt first, const T& init, std::vector& varg1, + const std::vector& idata1) + : first_(first), varg1_(varg1), idata1_(idata1), sum_(init) {} recursive_reducer(recursive_reducer& other, tbb::split) : first_(other.first_), - init_(other.init_), - arg1_(other.arg1_), - sum_(init_) {} + varg1_(other.varg1_), + idata1_(other.idata1_), + sum_(T(0.0)) {} void operator()(const tbb::blocked_range& r) { if (r.empty()) @@ -53,15 +54,17 @@ struct parallel_reduce_sum_impl { std::vector sub_slice(start, end); - sum_ += ReduceFunction()(r.begin(), r.end() - 1, sub_slice, arg1_); + sum_ += ReduceFunction()(r.begin(), r.end() - 1, sub_slice, varg1_, + idata1_); } + void join(const recursive_reducer& child) { sum_ += child.sum_; } }; T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, - Arg1& arg1) const { + std::vector& varg1, const std::vector& idata1) const { const std::size_t num_jobs = std::distance(first, last); - recursive_reducer worker(first, init, arg1); + recursive_reducer worker(first, init, varg1, idata1); tbb::parallel_reduce( tbb::blocked_range(0, num_jobs, grainsize), worker); return std::move(worker.sum_); @@ -71,11 +74,12 @@ struct parallel_reduce_sum_impl { template constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, - std::size_t grainsize, Arg1& arg1) { + std::size_t grainsize, std::vector& varg1, + const std::vector& idata1) { typedef T return_base_t; return internal::parallel_reduce_sum_impl()(first, last, init, - grainsize, arg1); + return_base_t>()( + first, last, init, grainsize, varg1, idata1); } } // namespace math diff --git a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp index 7bf2a7b3db9..c0f22dd6ea0 100644 --- a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp @@ -21,29 +21,30 @@ namespace internal { 
template struct parallel_reduce_sum_impl { - using ops_partials_t = operands_and_partials; + using ops_partials_t = operands_and_partials>; struct recursive_reducer { InputIt first_; - const T& init_; - Arg1& arg1_; - std::vector terms_adjs_; + std::vector& varg1_; + const std::vector& idata1_; + std::vector> terms_adjs_; std::vector terms_vals_; typedef typename std::iterator_traits::value_type elem_t; - recursive_reducer(InputIt first, const T& init, Arg1& arg1) + recursive_reducer(InputIt first, const T& init, std::vector& varg1, + const std::vector& idata1) : first_(first), - init_(init), - arg1_(arg1), - terms_adjs_(1, 0.0), - terms_vals_(1, init_.val()) {} + varg1_(varg1), + idata1_(idata1), + terms_adjs_(1, std::vector(varg1.size(), 0.0)), + terms_vals_(1, init.val()) {} recursive_reducer(recursive_reducer& other, tbb::split) : first_(other.first_), - init_(other.init_), - arg1_(other.arg1_), - terms_adjs_(1, 0.0), - terms_vals_(1, init_.val()) {} + varg1_(other.varg1_), + idata1_(other.idata1_), + terms_adjs_(), + terms_vals_() {} // terms_adjs_(other.terms_adjs_), terms_vals_(other.terms_vals_) {} void operator()(const tbb::blocked_range& r) { @@ -63,15 +64,25 @@ struct parallel_reduce_sum_impl { // right now we discard that it is a var if it would be) std::vector sub_slice(start, end); - // create a copy of arg1 which is not tied to the out AD tree - Arg1 arg1(var(new vari(arg1_.val(), false))); + // create a deep copy of arg1 which is not tied to the outer AD tree + std::vector varg1; + varg1.reserve(varg1_.size()); - T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, sub_slice, arg1); + for (Arg1& elem : varg1_) + varg1.emplace_back(var(new vari(elem.val(), false))); + + T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, sub_slice, varg1, + idata1_); sub_sum_v.grad(); terms_vals_.push_back(sub_sum_v.val()); - terms_adjs_.push_back(arg1.adj()); + + std::vector varg1_adj; + for (const Arg1& elem : varg1) + varg1_adj.push_back(elem.adj()); + + terms_adjs_.push_back(varg1_adj); } catch (const std::exception& e) { recover_memory_nested(); @@ -88,15 +99,19 @@ struct parallel_reduce_sum_impl { }; T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, - Arg1& arg1) const { + std::vector& varg1, const std::vector& idata) const { const std::size_t num_jobs = std::distance(first, last); - recursive_reducer worker(first, init, arg1); + recursive_reducer worker(first, init, varg1, idata); tbb::parallel_reduce( tbb::blocked_range(0, num_jobs, grainsize), worker); - std::vector terms(worker.terms_adjs_.size()); - ops_partials_t ops_sum(arg1); - ops_sum.edge1_.partials_[0] = sum(worker.terms_adjs_); - return ops_sum.build(sum(worker.terms_vals_)); + ops_partials_t ops_sum(varg1); + double sum = 0.0; + for (std::size_t i = 0; i != worker.terms_adjs_.size(); ++i) { + sum += worker.terms_vals_[i]; + for (std::size_t j = 0; j != worker.terms_adjs_[i].size(); ++j) + ops_sum.edge1_.partials_[j] += worker.terms_adjs_[i][j]; + } + return ops_sum.build(sum); } }; } // namespace internal diff --git a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp index 6e9824cf989..51ae3a1a0d3 100644 --- a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp +++ b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp @@ -14,8 +14,9 @@ struct count_lpdf { // does the reduction in the sub-slice start to end inline T operator()(std::size_t start, std::size_t end, - std::vector sub_slice, T& 
lambda) const { - return stan::math::poisson_lpmf(sub_slice, lambda); + std::vector& sub_slice, std::vector& lambda, + const std::vector& idata) const { + return stan::math::poisson_lpmf(sub_slice, lambda[0]); } }; @@ -30,14 +31,23 @@ TEST(parallel_v3_reduce_sum, value) { for (std::size_t i = 0; i != elems; ++i) data[i] = i; + std::vector idata; + std::vector vlambda_d(1, lambda_d); + typedef boost::counting_iterator count_iter; double poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), 0.0, 5, lambda_d); + data.begin(), data.end(), 0.0, 5, vlambda_d, idata); double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref); + + std::cout << "ref value of poisson lpdf : " << poisson_lpdf_ref << std::endl; + ; + + std::cout << "value of poisson lpdf : " << poisson_lpdf << std::endl; + ; } TEST(parallel_v3_reduce_sum, gradient) { @@ -56,8 +66,107 @@ TEST(parallel_v3_reduce_sum, gradient) { var lambda_v = lambda_d; + std::vector idata; + std::vector vlambda_v(1, lambda_v); + var poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), var(0.0), 5, lambda_v); + data.begin(), data.end(), var(0.0), 5, vlambda_v, idata); + + var lambda_ref = lambda_d; + var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref); + + EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref)); + + stan::math::grad(poisson_lpdf_ref.vi_); + const double lambda_ref_adj = lambda_ref.adj(); + + stan::math::set_zero_all_adjoints(); + stan::math::grad(poisson_lpdf.vi_); + const double lambda_adj = lambda_v.adj(); + + EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj); + + std::cout << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() + << std::endl; + ; + std::cout << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl; + ; + + std::cout << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl; + ; + std::cout << "gradient wrt to lambda: " << lambda_adj << std::endl; + ; + stan::math::recover_memory(); +} + +// ******************************** +// test if nested parallelism works +// ******************************** + +template +struct nesting_count_lpdf { + nesting_count_lpdf() {} + + // does the reduction in the sub-slice start to end + inline T operator()(std::size_t start, std::size_t end, + std::vector& sub_slice, std::vector& lambda, + const std::vector& idata) const { + return stan::math::parallel_reduce_sum>( + sub_slice.begin(), sub_slice.end(), T(0.0), 5, lambda, idata); + } +}; + +TEST(parallel_v3_reduce_sum, nesting_value) { + stan::math::init_threadpool_tbb(); + + double lambda_d = 10.0; + const std::size_t elems = 10000; + const std::size_t num_iter = 1000; + std::vector data(elems); + + for (std::size_t i = 0; i != elems; ++i) + data[i] = i; + + std::vector idata; + std::vector vlambda_d(1, lambda_d); + + typedef boost::counting_iterator count_iter; + + double poisson_lpdf = stan::math::parallel_reduce_sum>( + data.begin(), data.end(), 0.0, 5, vlambda_d, idata); + + double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); + + EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref); + + std::cout << "ref value of poisson lpdf : " << poisson_lpdf_ref << std::endl; + ; + + std::cout << "value of poisson lpdf : " << poisson_lpdf << std::endl; + ; +} + +TEST(parallel_v3_reduce_sum, nesting_gradient) { + stan::math::init_threadpool_tbb(); + + double lambda_d = 10.0; + const std::size_t elems = 10000; + const std::size_t num_iter = 1000; + std::vector data(elems); + + for (std::size_t 
i = 0; i != elems; ++i) + data[i] = i; + + typedef boost::counting_iterator count_iter; + using stan::math::var; + + var lambda_v = lambda_d; + + std::vector idata; + std::vector vlambda_v(1, lambda_v); + + var poisson_lpdf = stan::math::parallel_reduce_sum>( + data.begin(), data.end(), var(0.0), 5, vlambda_v, idata); var lambda_ref = lambda_d; var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref); @@ -83,4 +192,6 @@ TEST(parallel_v3_reduce_sum, gradient) { ; std::cout << "gradient wrt to lambda: " << lambda_adj << std::endl; ; + + stan::math::recover_memory(); } From 9740b45928d07c6dbd13bf304d4ea22706f4cc4f Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Tue, 31 Dec 2019 11:12:59 +0100 Subject: [PATCH 004/189] add hierarchical example --- .../functor/parallel_v3_reduce_sum_test.cpp | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp index 51ae3a1a0d3..d9616dcc150 100644 --- a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp +++ b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp @@ -195,3 +195,86 @@ TEST(parallel_v3_reduce_sum, nesting_gradient) { stan::math::recover_memory(); } + +// ******************************** +// basic performance test for a hierarchical model +// ******************************** + +template +struct grouped_count_lpdf { + grouped_count_lpdf() {} + + // does the reduction in the sub-slice start to end + inline T operator()(std::size_t start, std::size_t end, + std::vector& sub_slice, std::vector& lambda, + const std::vector& gidx) const { + const std::size_t num_terms = end - start + 1; + // std::cout << "sub-slice " << start << " - " << end << "; num_terms = " << + // num_terms << "; size = " << sub_slice.size() << std::endl; + std::vector lambda_slice(num_terms, 0.0); + for (std::size_t i = 0; i != num_terms; ++i) + lambda_slice[i] = lambda[gidx[start + i]]; + return stan::math::poisson_lpmf(sub_slice, lambda_slice); + } +}; + +TEST(parallel_v3_reduce_sum, grouped_gradient) { + stan::math::init_threadpool_tbb(); + + double lambda_d = 10.0; + const std::size_t groups = 10; + const std::size_t elems_per_group = 1000; + const std::size_t elems = groups * elems_per_group; + const std::size_t num_iter = 1000; + + std::vector data(elems); + std::vector gidx(elems); + + for (std::size_t i = 0; i != elems; ++i) { + data[i] = i; + gidx[i] = i / elems_per_group; + } + + using stan::math::var; + + std::vector vlambda_v; + + for (std::size_t i = 0; i != groups; ++i) + vlambda_v.push_back(i + 0.2); + + var lambda_v = vlambda_v[0]; + + var poisson_lpdf = stan::math::parallel_reduce_sum>( + data.begin(), data.end(), var(0.0), 5, vlambda_v, gidx); + + std::vector vref_lambda_v; + for (std::size_t i = 0; i != elems; ++i) { + vref_lambda_v.push_back(vlambda_v[gidx[i]]); + } + var lambda_ref = vlambda_v[0]; + + var poisson_lpdf_ref = stan::math::poisson_lpmf(data, vref_lambda_v); + + EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref)); + + stan::math::grad(poisson_lpdf_ref.vi_); + const double lambda_ref_adj = lambda_ref.adj(); + + stan::math::set_zero_all_adjoints(); + stan::math::grad(poisson_lpdf.vi_); + const double lambda_adj = lambda_v.adj(); + + EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj); + + std::cout << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() + << std::endl; + ; + std::cout << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl; + ; + + 
std::cout << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl; + ; + std::cout << "gradient wrt to lambda: " << lambda_adj << std::endl; + ; + stan::math::recover_memory(); +} From e47ca13518ddbff1f35dca1813a59f4f17dd5dae Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Tue, 31 Dec 2019 14:51:29 +0100 Subject: [PATCH 005/189] add recover_memory_global --- .../prim/scal/functor/parallel_reduce_sum.hpp | 40 +++++++++++++++++++ stan/math/rev/core.hpp | 1 + stan/math/rev/core/init_chainablestack.hpp | 7 ++-- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp index 6b613132012..415b925f8cd 100644 --- a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp @@ -72,6 +72,46 @@ struct parallel_reduce_sum_impl { }; } // namespace internal +/* + * The ReduceFunction is expected to have an operator with the + * signature + * + * inline T operator()(std::size_t start, std::size_t end, + * std::vector& sub_slice, std::vector& lambda, + * const std::vector& gidx) const + * + * So the input data is sliced into smaller pieces and given into the + * reducer. Which exact sub-slice is given to the function is defined + * by start and end index. These can be used to map data to groups, + * for example. It would be good to extend this signature in two ways: + * + * 1. The large std::vector which is sliced into smaller pieces should + * be allowed to be any data (so int, double, vectors, matrices, + * ...). In a second step this should also be extended to vars + * which will require more considerations, but is beneficial. One + * could leave this generalization to a future version. + * + * 2. The arguments past sub_slice should be variable. So the + * arguments past sub_slice would need to be represented by the + * "..." thing in C++. The complication for this is managing all + * the calls. + * + * So the parallel_reduce_sum signature should look like + * + * template + * constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, + * std::size_t grainsize, ...) + * + * which corresponds to a ReduceFunction which looks like + * + * inline T operator()(std::size_t start, std::size_t end, + * std::vector& sub_slice, ...) + * + * Note that the ReduceFunction is only passed in as type to prohibit + * that any internal state of the functor is causing trouble. Thus, + * the functor must be default constructible without any arguments. 
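+ *
+ * As an illustration (not part of the interface), a minimal
+ * ReduceFunction conforming to this signature is the count_lpdf
+ * functor from the unit tests; the int element type, the Poisson
+ * example and the names used here follow that test and are only an
+ * assumption of the example, not a requirement:
+ *
+ *   template <typename T>
+ *   struct count_lpdf {
+ *     count_lpdf() {}
+ *     // sums the Poisson log-mass over the sub-slice [start, end]
+ *     inline T operator()(std::size_t start, std::size_t end,
+ *                         std::vector<int>& sub_slice,
+ *                         std::vector<T>& lambda,
+ *                         const std::vector<int>& gidx) const {
+ *       return stan::math::poisson_lpmf(sub_slice, lambda[0]);
+ *     }
+ *   };
+ *
+ * which is then called as in the tests:
+ *
+ *   var lp = stan::math::parallel_reduce_sum<count_lpdf<var>>(
+ *       data.begin(), data.end(), var(0.0), 5, vlambda_v, gidx);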
+ * + */ template constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, std::size_t grainsize, std::vector& varg1, diff --git a/stan/math/rev/core.hpp b/stan/math/rev/core.hpp index db951f75576..b44f1cce469 100644 --- a/stan/math/rev/core.hpp +++ b/stan/math/rev/core.hpp @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/stan/math/rev/core/init_chainablestack.hpp b/stan/math/rev/core/init_chainablestack.hpp index 8f1b903cc9e..54646d85d66 100644 --- a/stan/math/rev/core/init_chainablestack.hpp +++ b/stan/math/rev/core/init_chainablestack.hpp @@ -53,16 +53,15 @@ class ad_tape_observer : public tbb::task_scheduler_observer { } } - private: ad_map thread_tape_map_; std::mutex thread_tape_map_mutex_; }; -namespace { +namespace internal { -ad_tape_observer global_observer; +static ad_tape_observer global_observer; -} // namespace +} // namespace internal } // namespace math } // namespace stan From 9565db93a1f3ab2430aa9b5ccaef1300b4b6f124 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Tue, 31 Dec 2019 14:52:37 +0100 Subject: [PATCH 006/189] add file --- stan/math/rev/core/recover_memory_global.hpp | 42 ++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 stan/math/rev/core/recover_memory_global.hpp diff --git a/stan/math/rev/core/recover_memory_global.hpp b/stan/math/rev/core/recover_memory_global.hpp new file mode 100644 index 00000000000..ef48078adf8 --- /dev/null +++ b/stan/math/rev/core/recover_memory_global.hpp @@ -0,0 +1,42 @@ +#ifndef STAN_MATH_REV_CORE_RECOVER_MEMORY_GLOBAL_HPP +#define STAN_MATH_REV_CORE_RECOVER_MEMORY_GLOBAL_HPP + +#include +#include +#include +#include + +namespace stan { +namespace math { + +/** + * Recover memory global used for all variables for reuse in all + * threads. 
+ * + * @throw std::logic_error if empty_nested() returns + * false + */ +static inline void recover_memory_global() { + /* todo: check for each thread + if (!empty_nested()) { + throw std::logic_error( + "empty_nested() must be true" + " before calling recover_memory()"); + } + */ + + for (auto & [ thread, instance ] : + internal::global_observer.thread_tape_map) { + instance_->var_stack_.clear(); + instance_->var_nochain_stack_.clear(); + for (auto &x : instance_->var_alloc_stack_) { + delete x; + } + instance_->var_alloc_stack_.clear(); + instance_->memalloc_.recover_all(); + } +} + +} // namespace math +} // namespace stan +#endif From acbb3dcbec3eaae373a6abbc604adacebe1205fe Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Tue, 31 Dec 2019 15:03:26 +0100 Subject: [PATCH 007/189] fix --- stan/math/rev/core/autodiffstackstorage.hpp | 5 ++++- stan/math/rev/core/recover_memory_global.hpp | 14 +++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/stan/math/rev/core/autodiffstackstorage.hpp b/stan/math/rev/core/autodiffstackstorage.hpp index 9009e1b0da0..dd7abf0a67e 100644 --- a/stan/math/rev/core/autodiffstackstorage.hpp +++ b/stan/math/rev/core/autodiffstackstorage.hpp @@ -90,7 +90,8 @@ struct AutodiffStackSingleton { using AutodiffStackSingleton_t = AutodiffStackSingleton; - AutodiffStackSingleton() : own_instance_(init()) {} + AutodiffStackSingleton() + : own_instance_(init()), active_instance_(instance_) {} ~AutodiffStackSingleton() { if (own_instance_) { delete instance_; @@ -117,6 +118,8 @@ struct AutodiffStackSingleton { static STAN_THREADS_DEF AutodiffStackStorage *instance_; + AutodiffStackStorage *active_instance_; + private: static bool init() { static STAN_THREADS_DEF bool is_initialized = false; diff --git a/stan/math/rev/core/recover_memory_global.hpp b/stan/math/rev/core/recover_memory_global.hpp index ef48078adf8..9c54d443dee 100644 --- a/stan/math/rev/core/recover_memory_global.hpp +++ b/stan/math/rev/core/recover_memory_global.hpp @@ -25,15 +25,15 @@ static inline void recover_memory_global() { } */ - for (auto & [ thread, instance ] : - internal::global_observer.thread_tape_map) { - instance_->var_stack_.clear(); - instance_->var_nochain_stack_.clear(); - for (auto &x : instance_->var_alloc_stack_) { + for (const auto& kv : internal::global_observer.thread_tape_map) { + ChainableStack& instance_ = *(kv.second->active_instance_); + instance_.var_stack_.clear(); + instance_.var_nochain_stack_.clear(); + for (auto& x : instance_.var_alloc_stack_) { delete x; } - instance_->var_alloc_stack_.clear(); - instance_->memalloc_.recover_all(); + instance_.var_alloc_stack_.clear(); + instance_.memalloc_.recover_all(); } } From bc2a6015d50d48727d316d24dd2122611b245c3d Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Tue, 31 Dec 2019 15:23:21 +0100 Subject: [PATCH 008/189] fix --- stan/math/rev/core/autodiffstackstorage.hpp | 4 +++- stan/math/rev/core/recover_memory_global.hpp | 21 +++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/stan/math/rev/core/autodiffstackstorage.hpp b/stan/math/rev/core/autodiffstackstorage.hpp index dd7abf0a67e..860fb5de95a 100644 --- a/stan/math/rev/core/autodiffstackstorage.hpp +++ b/stan/math/rev/core/autodiffstackstorage.hpp @@ -91,7 +91,9 @@ struct AutodiffStackSingleton { = AutodiffStackSingleton; AutodiffStackSingleton() - : own_instance_(init()), active_instance_(instance_) {} + : own_instance_(init()), active_instance_(nullptr) { + active_instance_ = 
AutodiffStackSingleton_t::instance_; + } ~AutodiffStackSingleton() { if (own_instance_) { delete instance_; diff --git a/stan/math/rev/core/recover_memory_global.hpp b/stan/math/rev/core/recover_memory_global.hpp index 9c54d443dee..c961948b6a2 100644 --- a/stan/math/rev/core/recover_memory_global.hpp +++ b/stan/math/rev/core/recover_memory_global.hpp @@ -6,6 +6,8 @@ #include #include +#include + namespace stan { namespace math { @@ -25,15 +27,20 @@ static inline void recover_memory_global() { } */ - for (const auto& kv : internal::global_observer.thread_tape_map) { - ChainableStack& instance_ = *(kv.second->active_instance_); - instance_.var_stack_.clear(); - instance_.var_nochain_stack_.clear(); - for (auto& x : instance_.var_alloc_stack_) { + for (auto& kv : internal::global_observer.thread_tape_map_) { + ChainableStack::AutodiffStackStorage* instance_ = kv.second->active_instance_; + std::cout << "reference ptr: " << instance_ << std::endl; + if (instance_ == nullptr) { + std::cout << "nullptr ??" << std::endl; + continue; + } + instance_->var_stack_.clear(); + instance_->var_nochain_stack_.clear(); + for (auto& x : instance_->var_alloc_stack_) { delete x; } - instance_.var_alloc_stack_.clear(); - instance_.memalloc_.recover_all(); + instance_->var_alloc_stack_.clear(); + instance_->memalloc_.recover_all(); } } From eec74724be5cfdd9e410d9a74e48a5d2911a9f6d Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Tue, 31 Dec 2019 15:25:25 +0100 Subject: [PATCH 009/189] remove debugging msg --- stan/math/rev/core/recover_memory_global.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stan/math/rev/core/recover_memory_global.hpp b/stan/math/rev/core/recover_memory_global.hpp index c961948b6a2..e6377ba7eb0 100644 --- a/stan/math/rev/core/recover_memory_global.hpp +++ b/stan/math/rev/core/recover_memory_global.hpp @@ -27,13 +27,15 @@ static inline void recover_memory_global() { } */ + std::size_t i = 0; + for (auto& kv : internal::global_observer.thread_tape_map_) { ChainableStack::AutodiffStackStorage* instance_ = kv.second->active_instance_; - std::cout << "reference ptr: " << instance_ << std::endl; if (instance_ == nullptr) { - std::cout << "nullptr ??" << std::endl; + std::cout << "nullptr for thread " << i << " ?? 
"<< std::endl; continue; } + i++; instance_->var_stack_.clear(); instance_->var_nochain_stack_.clear(); for (auto& x : instance_->var_alloc_stack_) { From 2d696c8cd506e8a12cff494961a1a2cc07c180cd Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Wed, 1 Jan 2020 13:57:09 +0100 Subject: [PATCH 010/189] omit recover_memory_global which is not needed --- stan/math/rev/core.hpp | 1 - stan/math/rev/core/autodiffstackstorage.hpp | 7 +-- stan/math/rev/core/init_chainablestack.hpp | 7 +-- stan/math/rev/core/recover_memory_global.hpp | 51 -------------------- 4 files changed, 5 insertions(+), 61 deletions(-) delete mode 100644 stan/math/rev/core/recover_memory_global.hpp diff --git a/stan/math/rev/core.hpp b/stan/math/rev/core.hpp index b44f1cce469..db951f75576 100644 --- a/stan/math/rev/core.hpp +++ b/stan/math/rev/core.hpp @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/stan/math/rev/core/autodiffstackstorage.hpp b/stan/math/rev/core/autodiffstackstorage.hpp index 860fb5de95a..9009e1b0da0 100644 --- a/stan/math/rev/core/autodiffstackstorage.hpp +++ b/stan/math/rev/core/autodiffstackstorage.hpp @@ -90,10 +90,7 @@ struct AutodiffStackSingleton { using AutodiffStackSingleton_t = AutodiffStackSingleton; - AutodiffStackSingleton() - : own_instance_(init()), active_instance_(nullptr) { - active_instance_ = AutodiffStackSingleton_t::instance_; - } + AutodiffStackSingleton() : own_instance_(init()) {} ~AutodiffStackSingleton() { if (own_instance_) { delete instance_; @@ -120,8 +117,6 @@ struct AutodiffStackSingleton { static STAN_THREADS_DEF AutodiffStackStorage *instance_; - AutodiffStackStorage *active_instance_; - private: static bool init() { static STAN_THREADS_DEF bool is_initialized = false; diff --git a/stan/math/rev/core/init_chainablestack.hpp b/stan/math/rev/core/init_chainablestack.hpp index 54646d85d66..8f1b903cc9e 100644 --- a/stan/math/rev/core/init_chainablestack.hpp +++ b/stan/math/rev/core/init_chainablestack.hpp @@ -53,15 +53,16 @@ class ad_tape_observer : public tbb::task_scheduler_observer { } } + private: ad_map thread_tape_map_; std::mutex thread_tape_map_mutex_; }; -namespace internal { +namespace { -static ad_tape_observer global_observer; +ad_tape_observer global_observer; -} // namespace internal +} // namespace } // namespace math } // namespace stan diff --git a/stan/math/rev/core/recover_memory_global.hpp b/stan/math/rev/core/recover_memory_global.hpp deleted file mode 100644 index e6377ba7eb0..00000000000 --- a/stan/math/rev/core/recover_memory_global.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef STAN_MATH_REV_CORE_RECOVER_MEMORY_GLOBAL_HPP -#define STAN_MATH_REV_CORE_RECOVER_MEMORY_GLOBAL_HPP - -#include -#include -#include -#include - -#include - -namespace stan { -namespace math { - -/** - * Recover memory global used for all variables for reuse in all - * threads. - * - * @throw std::logic_error if empty_nested() returns - * false - */ -static inline void recover_memory_global() { - /* todo: check for each thread - if (!empty_nested()) { - throw std::logic_error( - "empty_nested() must be true" - " before calling recover_memory()"); - } - */ - - std::size_t i = 0; - - for (auto& kv : internal::global_observer.thread_tape_map_) { - ChainableStack::AutodiffStackStorage* instance_ = kv.second->active_instance_; - if (instance_ == nullptr) { - std::cout << "nullptr for thread " << i << " ?? 
"<< std::endl; - continue; - } - i++; - instance_->var_stack_.clear(); - instance_->var_nochain_stack_.clear(); - for (auto& x : instance_->var_alloc_stack_) { - delete x; - } - instance_->var_alloc_stack_.clear(); - instance_->memalloc_.recover_all(); - } -} - -} // namespace math -} // namespace stan -#endif From 6d8ad8a4f7b71774d2578828f6326f5be9c50d02 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Wed, 1 Jan 2020 14:16:37 +0100 Subject: [PATCH 011/189] aggregate more efficiently the partial sums --- .../rev/scal/functor/parallel_reduce_sum.hpp | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp index c0f22dd6ea0..eb578c0de8a 100644 --- a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp @@ -21,14 +21,15 @@ namespace internal { template struct parallel_reduce_sum_impl { - using ops_partials_t = operands_and_partials>; + using ops_partials_t + = operands_and_partials, const std::vector>; struct recursive_reducer { InputIt first_; std::vector& varg1_; const std::vector& idata1_; - std::vector> terms_adjs_; - std::vector terms_vals_; + ops_partials_t terms_partials_; + double terms_sum_; typedef typename std::iterator_traits::value_type elem_t; recursive_reducer(InputIt first, const T& init, std::vector& varg1, @@ -36,16 +37,15 @@ struct parallel_reduce_sum_impl { : first_(first), varg1_(varg1), idata1_(idata1), - terms_adjs_(1, std::vector(varg1.size(), 0.0)), - terms_vals_(1, init.val()) {} + terms_partials_(varg1_, idata1_), + terms_sum_(init.val()) {} recursive_reducer(recursive_reducer& other, tbb::split) : first_(other.first_), varg1_(other.varg1_), idata1_(other.idata1_), - terms_adjs_(), - terms_vals_() {} - // terms_adjs_(other.terms_adjs_), terms_vals_(other.terms_vals_) {} + terms_partials_(varg1_, idata1_), + terms_sum_(0.0) {} void operator()(const tbb::blocked_range& r) { if (r.empty()) @@ -64,7 +64,13 @@ struct parallel_reduce_sum_impl { // right now we discard that it is a var if it would be) std::vector sub_slice(start, end); - // create a deep copy of arg1 which is not tied to the outer AD tree + // create a deep copy of arg1 which is not tied to the outer + // AD tree + // todo: these copies can be put onto a thread-local storage + // object such that we only create these once per thread which + // should significantly boost performance as many copies are + // avoided... 
this comes at the cost that the adjoints have to + // be zeroed "manually" std::vector varg1; varg1.reserve(varg1_.size()); @@ -76,13 +82,10 @@ struct parallel_reduce_sum_impl { sub_sum_v.grad(); - terms_vals_.push_back(sub_sum_v.val()); + terms_sum_ += sub_sum_v.val(); - std::vector varg1_adj; - for (const Arg1& elem : varg1) - varg1_adj.push_back(elem.adj()); - - terms_adjs_.push_back(varg1_adj); + for (std::size_t i = 0; i != varg1.size(); ++i) + terms_partials_.edge1_.partials_[i] += varg1[i].adj(); } catch (const std::exception& e) { recover_memory_nested(); @@ -90,11 +93,13 @@ struct parallel_reduce_sum_impl { } recover_memory_nested(); } + void join(const recursive_reducer& child) { - terms_adjs_.insert(terms_adjs_.end(), child.terms_adjs_.begin(), - child.terms_adjs_.end()); - terms_vals_.insert(terms_vals_.end(), child.terms_vals_.begin(), - child.terms_vals_.end()); + terms_sum_ += child.terms_sum_; + + for (std::size_t i = 0; i != varg1_.size(); ++i) + terms_partials_.edge1_.partials_[i] + += child.terms_partials_.edge1_.partials_[i]; } }; @@ -104,14 +109,7 @@ struct parallel_reduce_sum_impl { recursive_reducer worker(first, init, varg1, idata); tbb::parallel_reduce( tbb::blocked_range(0, num_jobs, grainsize), worker); - ops_partials_t ops_sum(varg1); - double sum = 0.0; - for (std::size_t i = 0; i != worker.terms_adjs_.size(); ++i) { - sum += worker.terms_vals_[i]; - for (std::size_t j = 0; j != worker.terms_adjs_[i].size(); ++j) - ops_sum.edge1_.partials_[j] += worker.terms_adjs_[i][j]; - } - return ops_sum.build(sum); + return worker.terms_partials_.build(worker.terms_sum_); } }; } // namespace internal From 1a54bde31e43543445db96efb6b90c6c87999793 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Thu, 2 Jan 2020 10:54:10 +0100 Subject: [PATCH 012/189] simplify how values are copied --- stan/math/rev/scal/functor/parallel_reduce_sum.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp index eb578c0de8a..9e20a448d62 100644 --- a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp @@ -71,11 +71,18 @@ struct parallel_reduce_sum_impl { // should significantly boost performance as many copies are // avoided... 
this comes at the cost that the adjoints have to // be zeroed "manually" + /* using the var_nochain_stack_ tape avoids up-propagation of chain std::vector varg1; varg1.reserve(varg1_.size()); for (Arg1& elem : varg1_) varg1.emplace_back(var(new vari(elem.val(), false))); + */ + + // but this should actually do the same if we simply + // instantiate the var from a double representation + std::vector varg1_d = value_of(varg1_); + std::vector varg1(varg1_d.begin(), varg1_d.end()); T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, sub_slice, varg1, idata1_); From f689c93481cb42167b5e5df6bb14a66c356039f4 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Thu, 2 Jan 2020 11:09:40 +0100 Subject: [PATCH 013/189] const correctness --- .../prim/scal/functor/parallel_reduce_sum.hpp | 11 +++++++---- .../rev/scal/functor/parallel_reduce_sum.hpp | 16 ++++++++++------ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp index 415b925f8cd..b2fc6669abe 100644 --- a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp @@ -28,12 +28,13 @@ template struct parallel_reduce_sum_impl { struct recursive_reducer { InputIt first_; - std::vector& varg1_; + const std::vector& varg1_; const std::vector& idata1_; T sum_; typedef typename std::iterator_traits::value_type elem_t; - recursive_reducer(InputIt first, const T& init, std::vector& varg1, + recursive_reducer(InputIt first, const T& init, + const std::vector& varg1, const std::vector& idata1) : first_(first), varg1_(varg1), idata1_(idata1), sum_(init) {} @@ -62,7 +63,8 @@ struct parallel_reduce_sum_impl { }; T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, - std::vector& varg1, const std::vector& idata1) const { + const std::vector& varg1, + const std::vector& idata1) const { const std::size_t num_jobs = std::distance(first, last); recursive_reducer worker(first, init, varg1, idata1); tbb::parallel_reduce( @@ -114,7 +116,8 @@ struct parallel_reduce_sum_impl { */ template constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, - std::size_t grainsize, std::vector& varg1, + std::size_t grainsize, + const std::vector& varg1, const std::vector& idata1) { typedef T return_base_t; return internal::parallel_reduce_sum_impl { struct recursive_reducer { InputIt first_; - std::vector& varg1_; + const std::vector& varg1_; const std::vector& idata1_; ops_partials_t terms_partials_; double terms_sum_; typedef typename std::iterator_traits::value_type elem_t; - recursive_reducer(InputIt first, const T& init, std::vector& varg1, + recursive_reducer(InputIt first, const T& init, + const std::vector& varg1, const std::vector& idata1) : first_(first), varg1_(varg1), @@ -81,7 +82,7 @@ struct parallel_reduce_sum_impl { // but this should actually do the same if we simply // instantiate the var from a double representation - std::vector varg1_d = value_of(varg1_); + const std::vector varg1_d = value_of(varg1_); std::vector varg1(varg1_d.begin(), varg1_d.end()); T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, sub_slice, varg1, @@ -111,11 +112,14 @@ struct parallel_reduce_sum_impl { }; T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, - std::vector& varg1, const std::vector& idata) const { + const std::vector& varg1, + const std::vector& idata) const { const std::size_t num_jobs = std::distance(first, last); recursive_reducer worker(first, 
init, varg1, idata); - tbb::parallel_reduce( - tbb::blocked_range(0, num_jobs, grainsize), worker); + tbb::static_partitioner work_partitioner; + tbb::parallel_deterministic_reduce( + tbb::blocked_range(0, num_jobs, grainsize), worker, + work_partitioner); return worker.terms_partials_.build(worker.terms_sum_); } }; From 0da9de09f51243cb6c8c7fa01a2f7e9fe2de71fd Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Thu, 2 Jan 2020 19:28:41 +0100 Subject: [PATCH 014/189] make code more generic (some meta magic bits are missing) --- .../prim/scal/functor/parallel_reduce_sum.hpp | 76 +++++---- .../rev/scal/functor/parallel_reduce_sum.hpp | 153 ++++++++++++------ .../functor/parallel_v3_reduce_sum_test.cpp | 20 +-- 3 files changed, 159 insertions(+), 90 deletions(-) diff --git a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp index b2fc6669abe..16e316819ac 100644 --- a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/prim/scal/functor/parallel_reduce_sum.hpp @@ -20,56 +20,62 @@ namespace math { namespace internal { // base definition => compile error -template struct parallel_reduce_sum_impl {}; -template -struct parallel_reduce_sum_impl { - struct recursive_reducer { - InputIt first_; - const std::vector& varg1_; - const std::vector& idata1_; - T sum_; - typedef typename std::iterator_traits::value_type elem_t; +template +struct parallel_reduce_sum_impl { + using vmapped_t = std::vector; + using arg1_t = std::vector; + using arg2_t = std::vector; - recursive_reducer(InputIt first, const T& init, - const std::vector& varg1, - const std::vector& idata1) - : first_(first), varg1_(varg1), idata1_(idata1), sum_(init) {} + struct recursive_reducer { + const vmapped_t& vmapped_; + const arg1_t& arg1_; + const arg2_t& arg2_; + T terms_sum_; + + recursive_reducer(const vmapped_t& vmapped, const T& init, + const arg1_t& arg1, const arg2_t& arg2) + : vmapped_(vmapped), + arg1_(arg1), + arg2_(arg2), + terms_sum_(value_of(init)) {} recursive_reducer(recursive_reducer& other, tbb::split) - : first_(other.first_), - varg1_(other.varg1_), - idata1_(other.idata1_), - sum_(T(0.0)) {} + : vmapped_(other.vmapped_), + arg1_(other.arg1_), + arg2_(other.arg2_), + terms_sum_(0.0) {} void operator()(const tbb::blocked_range& r) { if (r.empty()) return; - auto start = first_; + auto start = vmapped_.begin(); std::advance(start, r.begin()); - auto end = first_; + auto end = vmapped_.begin(); std::advance(end, r.end()); - std::vector sub_slice(start, end); + vmapped_t sub_slice(start, end); - sum_ += ReduceFunction()(r.begin(), r.end() - 1, sub_slice, varg1_, - idata1_); + terms_sum_ + += ReduceFunction()(r.begin(), r.end() - 1, sub_slice, arg1_, arg2_); } - void join(const recursive_reducer& child) { sum_ += child.sum_; } + void join(const recursive_reducer& child) { + terms_sum_ += child.terms_sum_; + } }; - T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, - const std::vector& varg1, - const std::vector& idata1) const { - const std::size_t num_jobs = std::distance(first, last); - recursive_reducer worker(first, init, varg1, idata1); + T operator()(const vmapped_t& vmapped, T init, std::size_t grainsize, + const arg1_t& arg1, const arg2_t& arg2) const { + const std::size_t num_jobs = vmapped.size(); + recursive_reducer worker(vmapped, init, arg1, arg2); tbb::parallel_reduce( tbb::blocked_range(0, num_jobs, grainsize), worker); - return std::move(worker.sum_); + return std::move(worker.terms_sum_); } }; } // namespace 
internal @@ -114,15 +120,15 @@ struct parallel_reduce_sum_impl { * the functor must be default constructible without any arguments. * */ -template -constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, +template +constexpr T parallel_reduce_sum(const std::vector& vmapped, T init, std::size_t grainsize, - const std::vector& varg1, - const std::vector& idata1) { + const std::vector& arg1, + const std::vector& arg2) { typedef T return_base_t; - return internal::parallel_reduce_sum_impl()( - first, last, init, grainsize, varg1, idata1); + vmapped, init, grainsize, arg1, arg2); } } // namespace math diff --git a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp index 8c8a6e62fb7..97ddfadba49 100644 --- a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp @@ -1,7 +1,7 @@ #ifndef STAN_MATH_REV_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP #define STAN_MATH_REV_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP -#include +#include #include #include @@ -19,51 +19,69 @@ namespace math { namespace internal { -template -struct parallel_reduce_sum_impl { +template +struct parallel_reduce_sum_impl { + using vmapped_t = std::vector; + using arg1_t = std::vector; + using arg2_t = std::vector; + + using vmapped_value_t = std::vector::type>; + using arg1_value_t = std::vector::type>; + // using arg2_value_t = std::vector::type>; + using arg2_value_t = std::vector; + using ops_partials_t - = operands_and_partials, const std::vector>; + = operands_and_partials, std::vector>; struct recursive_reducer { - InputIt first_; - const std::vector& varg1_; - const std::vector& idata1_; - ops_partials_t terms_partials_; + const vmapped_t& vmapped_; + const arg1_t& arg1_; + const arg1_value_t& arg1_value_; + const arg2_t& arg2_; + const arg2_value_t& arg2_value_; + + ops_partials_t& terms_partials_mapped_; + ops_partials_t terms_partials_args_; double terms_sum_; - typedef typename std::iterator_traits::value_type elem_t; - recursive_reducer(InputIt first, const T& init, - const std::vector& varg1, - const std::vector& idata1) - : first_(first), - varg1_(varg1), - idata1_(idata1), - terms_partials_(varg1_, idata1_), - terms_sum_(init.val()) {} + recursive_reducer(const vmapped_t& vmapped, const T& init, + const arg1_t& arg1, const arg1_value_t& arg1_value, + const arg2_t& arg2, const arg2_value_t& arg2_value, + ops_partials_t& terms_partials_mapped) + : vmapped_(vmapped), + arg1_(arg1), + arg1_value_(arg1_value), + arg2_(arg2), + arg2_value_(arg2_value), + terms_partials_mapped_(terms_partials_mapped), + terms_partials_args_(vmapped, arg1, arg2), + terms_sum_(value_of(init)) {} recursive_reducer(recursive_reducer& other, tbb::split) - : first_(other.first_), - varg1_(other.varg1_), - idata1_(other.idata1_), - terms_partials_(varg1_, idata1_), + : vmapped_(other.vmapped_), + arg1_(other.arg1_), + arg1_value_(other.arg1_value_), + arg2_(other.arg2_), + arg2_value_(other.arg2_value_), + terms_partials_mapped_(other.terms_partials_mapped_), + terms_partials_args_(vmapped_, arg1_, arg2_), terms_sum_(0.0) {} void operator()(const tbb::blocked_range& r) { if (r.empty()) return; - auto start = first_; - std::advance(start, r.begin()); - auto end = first_; - std::advance(end, r.end()); - try { start_nested(); // elem_t should better be a non-var!!! 
Otherwise we interfere // with the outer AD tree as we mess with the adjoints (and // right now we discard that it is a var if it would be) - std::vector sub_slice(start, end); + vmapped_t local_sub_slice; + + local_sub_slice.reserve(r.size()); + for (std::size_t i = r.begin(); i != r.end(); ++i) + local_sub_slice.emplace_back(value_of(vmapped_[i])); // create a deep copy of arg1 which is not tied to the outer // AD tree @@ -82,18 +100,38 @@ struct parallel_reduce_sum_impl { // but this should actually do the same if we simply // instantiate the var from a double representation - const std::vector varg1_d = value_of(varg1_); - std::vector varg1(varg1_d.begin(), varg1_d.end()); + // const std::vector varg1_d = value_of(varg1_); + // std::vector varg1(varg1_d.begin(), varg1_d.end()); + + arg1_t local_arg1(arg1_value_.begin(), arg1_value_.end()); + arg2_t local_arg2(arg2_value_.begin(), arg2_value_.end()); - T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, sub_slice, varg1, - idata1_); + T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, local_sub_slice, + local_arg1, local_arg2); sub_sum_v.grad(); terms_sum_ += sub_sum_v.val(); - for (std::size_t i = 0; i != varg1.size(); ++i) - terms_partials_.edge1_.partials_[i] += varg1[i].adj(); + /* + if (!is_constant_all::value) { + for (std::size_t i = 0; i != r.size(); ++i) + terms_partials_mapped_.edge1_.partials_[r.begin() + i] += + local_sub_slice[i].adj(); + } + */ + + if (!is_constant_all::value) { + for (std::size_t i = 0; i != local_arg1.size(); ++i) + terms_partials_args_.edge2_.partials_[i] += local_arg1[i].adj(); + } + + /* + if (!is_constant_all::value) { + for (std::size_t i = 0; i != local_arg2.size(); ++i) + terms_partials_args_.edge3_.partials_[i] += local_arg2[i].adj(); + } + */ } catch (const std::exception& e) { recover_memory_nested(); @@ -105,22 +143,45 @@ struct parallel_reduce_sum_impl { void join(const recursive_reducer& child) { terms_sum_ += child.terms_sum_; - for (std::size_t i = 0; i != varg1_.size(); ++i) - terms_partials_.edge1_.partials_[i] - += child.terms_partials_.edge1_.partials_[i]; + if (!is_constant_all::value) { + for (std::size_t i = 0; i != arg1_.size(); ++i) + terms_partials_args_.edge2_.partials_[i] + += child.terms_partials_args_.edge2_.partials_[i]; + } + + /* + if (!is_constant_all::value) { + for (std::size_t i = 0; i != arg2_.size(); ++i) + terms_partials_args_.edge3_.partials_[i] + += child.terms_partials_args_.edge3_.partials_[i]; + } + */ } }; - T operator()(InputIt first, InputIt last, T init, std::size_t grainsize, - const std::vector& varg1, - const std::vector& idata) const { - const std::size_t num_jobs = std::distance(first, last); - recursive_reducer worker(first, init, varg1, idata); - tbb::static_partitioner work_partitioner; - tbb::parallel_deterministic_reduce( - tbb::blocked_range(0, num_jobs, grainsize), worker, - work_partitioner); - return worker.terms_partials_.build(worker.terms_sum_); + T operator()(const vmapped_t& vmapped, T init, std::size_t grainsize, + const arg1_t& arg1, const arg2_t& arg2) const { + const std::size_t num_jobs = vmapped.size(); + arg1_value_t arg1_value = value_of(arg1); + arg2_value_t arg2_value = value_of(arg2); + + ops_partials_t ops(vmapped, arg1, arg2); + + recursive_reducer worker(vmapped, init, arg1, arg1_value, arg2, arg2_value, + ops); + + tbb::parallel_reduce( + tbb::blocked_range(0, num_jobs, grainsize), worker); + + if (!is_constant_all::value) { + ops.edge2_.partials_ = worker.terms_partials_args_.edge2_.partials_; + } + + if 
(!is_constant_all::value) { + ops.edge3_.partials_ = worker.terms_partials_args_.edge3_.partials_; + } + + return ops.build(worker.terms_sum_); } }; } // namespace internal diff --git a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp index d9616dcc150..e9d652dcfe0 100644 --- a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp +++ b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp @@ -14,7 +14,8 @@ struct count_lpdf { // does the reduction in the sub-slice start to end inline T operator()(std::size_t start, std::size_t end, - std::vector& sub_slice, std::vector& lambda, + const std::vector& sub_slice, + const std::vector& lambda, const std::vector& idata) const { return stan::math::poisson_lpmf(sub_slice, lambda[0]); } @@ -37,7 +38,7 @@ TEST(parallel_v3_reduce_sum, value) { typedef boost::counting_iterator count_iter; double poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), 0.0, 5, vlambda_d, idata); + data, 0.0, 5, vlambda_d, idata); double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); @@ -70,7 +71,7 @@ TEST(parallel_v3_reduce_sum, gradient) { std::vector vlambda_v(1, lambda_v); var poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), var(0.0), 5, vlambda_v, idata); + data, var(0.0), 5, vlambda_v, idata); var lambda_ref = lambda_d; var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref); @@ -109,10 +110,11 @@ struct nesting_count_lpdf { // does the reduction in the sub-slice start to end inline T operator()(std::size_t start, std::size_t end, - std::vector& sub_slice, std::vector& lambda, + const std::vector& sub_slice, + const std::vector& lambda, const std::vector& idata) const { - return stan::math::parallel_reduce_sum>( - sub_slice.begin(), sub_slice.end(), T(0.0), 5, lambda, idata); + return stan::math::parallel_reduce_sum>(sub_slice, T(0.0), 5, + lambda, idata); } }; @@ -133,7 +135,7 @@ TEST(parallel_v3_reduce_sum, nesting_value) { typedef boost::counting_iterator count_iter; double poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), 0.0, 5, vlambda_d, idata); + data, 0.0, 5, vlambda_d, idata); double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); @@ -166,7 +168,7 @@ TEST(parallel_v3_reduce_sum, nesting_gradient) { std::vector vlambda_v(1, lambda_v); var poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), var(0.0), 5, vlambda_v, idata); + data, var(0.0), 5, vlambda_v, idata); var lambda_ref = lambda_d; var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref); @@ -245,7 +247,7 @@ TEST(parallel_v3_reduce_sum, grouped_gradient) { var lambda_v = vlambda_v[0]; var poisson_lpdf = stan::math::parallel_reduce_sum>( - data.begin(), data.end(), var(0.0), 5, vlambda_v, gidx); + data, var(0.0), 5, vlambda_v, gidx); std::vector vref_lambda_v; for (std::size_t i = 0; i != elems; ++i) { From 73fb5ae75c81d95e2209436d041b10ad8b450da8 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Fri, 3 Jan 2020 13:33:48 +0100 Subject: [PATCH 015/189] make parallel reduce sum work with posted example... 
need more meta-programs to handle general case --- .../rev/scal/functor/parallel_reduce_sum.hpp | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp index 97ddfadba49..3809a25bb84 100644 --- a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/rev/scal/functor/parallel_reduce_sum.hpp @@ -17,6 +17,11 @@ namespace stan { namespace math { +template <> +struct child_type { + using type = int; +}; + namespace internal { template @@ -27,8 +32,7 @@ struct parallel_reduce_sum_impl { using vmapped_value_t = std::vector::type>; using arg1_value_t = std::vector::type>; - // using arg2_value_t = std::vector::type>; - using arg2_value_t = std::vector; + using arg2_value_t = std::vector::type>; using ops_partials_t = operands_and_partials, std::vector>; @@ -113,18 +117,20 @@ struct parallel_reduce_sum_impl { terms_sum_ += sub_sum_v.val(); - /* + /**/ if (!is_constant_all::value) { for (std::size_t i = 0; i != r.size(); ++i) - terms_partials_mapped_.edge1_.partials_[r.begin() + i] += - local_sub_slice[i].adj(); + terms_partials_mapped_.edge1_.partials_[r.begin() + i] + += local_sub_slice[i].adj(); } - */ + /**/ + /* if (!is_constant_all::value) { for (std::size_t i = 0; i != local_arg1.size(); ++i) terms_partials_args_.edge2_.partials_[i] += local_arg1[i].adj(); } + */ /* if (!is_constant_all::value) { @@ -143,11 +149,13 @@ struct parallel_reduce_sum_impl { void join(const recursive_reducer& child) { terms_sum_ += child.terms_sum_; + /* if (!is_constant_all::value) { for (std::size_t i = 0; i != arg1_.size(); ++i) terms_partials_args_.edge2_.partials_[i] += child.terms_partials_args_.edge2_.partials_[i]; } + */ /* if (!is_constant_all::value) { From e7a83c6dd6502504fb70f4060ce3d42d858dd2f3 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Sun, 5 Jan 2020 14:11:21 +0100 Subject: [PATCH 016/189] rename to reduce_sum --- stan/math/prim/scal.hpp | 2 +- ...parallel_reduce_sum.hpp => reduce_sum.hpp} | 16 ++++++------ stan/math/rev/scal.hpp | 2 +- ...parallel_reduce_sum.hpp => reduce_sum.hpp} | 18 ++++++------- .../functor/parallel_v3_reduce_sum_test.cpp | 26 +++++++++---------- 5 files changed, 32 insertions(+), 32 deletions(-) rename stan/math/prim/scal/functor/{parallel_reduce_sum.hpp => reduce_sum.hpp} (89%) rename stan/math/rev/scal/functor/{parallel_reduce_sum.hpp => reduce_sum.hpp} (96%) diff --git a/stan/math/prim/scal.hpp b/stan/math/prim/scal.hpp index ea4700b4722..d25d7e7b436 100644 --- a/stan/math/prim/scal.hpp +++ b/stan/math/prim/scal.hpp @@ -131,7 +131,7 @@ #include #include -#include +#include #include #include diff --git a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp b/stan/math/prim/scal/functor/reduce_sum.hpp similarity index 89% rename from stan/math/prim/scal/functor/parallel_reduce_sum.hpp rename to stan/math/prim/scal/functor/reduce_sum.hpp index 16e316819ac..333e5ef6170 100644 --- a/stan/math/prim/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/prim/scal/functor/reduce_sum.hpp @@ -1,5 +1,5 @@ -#ifndef STAN_MATH_PRIM_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP -#define STAN_MATH_PRIM_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP +#ifndef STAN_MATH_PRIM_SCAL_FUNCTOR_REDUCE_SUM_HPP +#define STAN_MATH_PRIM_SCAL_FUNCTOR_REDUCE_SUM_HPP #include #include @@ -22,10 +22,10 @@ namespace internal { // base definition => compile error template -struct parallel_reduce_sum_impl {}; +struct reduce_sum_impl {}; template -struct parallel_reduce_sum_impl 
{ +struct reduce_sum_impl { using vmapped_t = std::vector; using arg1_t = std::vector; using arg2_t = std::vector; @@ -104,10 +104,10 @@ struct parallel_reduce_sum_impl { * "..." thing in C++. The complication for this is managing all * the calls. * - * So the parallel_reduce_sum signature should look like + * So the reduce_sum signature should look like * * template - * constexpr T parallel_reduce_sum(InputIt first, InputIt last, T init, + * constexpr T reduce_sum(InputIt first, InputIt last, T init, * std::size_t grainsize, ...) * * which corresponds to a ReduceFunction which looks like @@ -121,12 +121,12 @@ struct parallel_reduce_sum_impl { * */ template -constexpr T parallel_reduce_sum(const std::vector& vmapped, T init, +constexpr T reduce_sum(const std::vector& vmapped, T init, std::size_t grainsize, const std::vector& arg1, const std::vector& arg2) { typedef T return_base_t; - return internal::parallel_reduce_sum_impl()( vmapped, init, grainsize, arg1, arg2); } diff --git a/stan/math/rev/scal.hpp b/stan/math/rev/scal.hpp index 2ac1387a702..a8f5a55a49b 100644 --- a/stan/math/rev/scal.hpp +++ b/stan/math/rev/scal.hpp @@ -102,6 +102,6 @@ #include #include -#include +#include #endif diff --git a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp b/stan/math/rev/scal/functor/reduce_sum.hpp similarity index 96% rename from stan/math/rev/scal/functor/parallel_reduce_sum.hpp rename to stan/math/rev/scal/functor/reduce_sum.hpp index 3809a25bb84..df59c1d458b 100644 --- a/stan/math/rev/scal/functor/parallel_reduce_sum.hpp +++ b/stan/math/rev/scal/functor/reduce_sum.hpp @@ -1,5 +1,5 @@ -#ifndef STAN_MATH_REV_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP -#define STAN_MATH_REV_SCAL_FUNCTOR_PARALLEL_REDUCE_SUM_HPP +#ifndef STAN_MATH_REV_SCAL_FUNCTOR_REDUCE_SUM_HPP +#define STAN_MATH_REV_SCAL_FUNCTOR_REDUCE_SUM_HPP #include #include @@ -25,7 +25,7 @@ struct child_type { namespace internal { template -struct parallel_reduce_sum_impl { +struct reduce_sum_impl { using vmapped_t = std::vector; using arg1_t = std::vector; using arg2_t = std::vector; @@ -117,20 +117,20 @@ struct parallel_reduce_sum_impl { terms_sum_ += sub_sum_v.val(); - /**/ + /* if (!is_constant_all::value) { for (std::size_t i = 0; i != r.size(); ++i) terms_partials_mapped_.edge1_.partials_[r.begin() + i] += local_sub_slice[i].adj(); } - /**/ + */ - /* + /**/ if (!is_constant_all::value) { for (std::size_t i = 0; i != local_arg1.size(); ++i) terms_partials_args_.edge2_.partials_[i] += local_arg1[i].adj(); } - */ + /**/ /* if (!is_constant_all::value) { @@ -149,13 +149,13 @@ struct parallel_reduce_sum_impl { void join(const recursive_reducer& child) { terms_sum_ += child.terms_sum_; - /* + /**/ if (!is_constant_all::value) { for (std::size_t i = 0; i != arg1_.size(); ++i) terms_partials_args_.edge2_.partials_[i] += child.terms_partials_args_.edge2_.partials_[i]; } - */ + /**/ /* if (!is_constant_all::value) { diff --git a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp index e9d652dcfe0..d879aa04d21 100644 --- a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp +++ b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp @@ -21,7 +21,7 @@ struct count_lpdf { } }; -TEST(parallel_v3_reduce_sum, value) { +TEST(v3_reduce_sum, value) { stan::math::init_threadpool_tbb(); double lambda_d = 10.0; @@ -37,7 +37,7 @@ TEST(parallel_v3_reduce_sum, value) { typedef boost::counting_iterator count_iter; - double poisson_lpdf = 
stan::math::parallel_reduce_sum>( + double poisson_lpdf = stan::math::reduce_sum>( data, 0.0, 5, vlambda_d, idata); double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); @@ -51,7 +51,7 @@ TEST(parallel_v3_reduce_sum, value) { ; } -TEST(parallel_v3_reduce_sum, gradient) { +TEST(v3_reduce_sum, gradient) { stan::math::init_threadpool_tbb(); double lambda_d = 10.0; @@ -70,8 +70,8 @@ TEST(parallel_v3_reduce_sum, gradient) { std::vector idata; std::vector vlambda_v(1, lambda_v); - var poisson_lpdf = stan::math::parallel_reduce_sum>( - data, var(0.0), 5, vlambda_v, idata); + var poisson_lpdf = stan::math::reduce_sum>(data, var(0.0), 5, + vlambda_v, idata); var lambda_ref = lambda_d; var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref); @@ -113,12 +113,12 @@ struct nesting_count_lpdf { const std::vector& sub_slice, const std::vector& lambda, const std::vector& idata) const { - return stan::math::parallel_reduce_sum>(sub_slice, T(0.0), 5, - lambda, idata); + return stan::math::reduce_sum>(sub_slice, T(0.0), 5, lambda, + idata); } }; -TEST(parallel_v3_reduce_sum, nesting_value) { +TEST(v3_reduce_sum, nesting_value) { stan::math::init_threadpool_tbb(); double lambda_d = 10.0; @@ -134,7 +134,7 @@ TEST(parallel_v3_reduce_sum, nesting_value) { typedef boost::counting_iterator count_iter; - double poisson_lpdf = stan::math::parallel_reduce_sum>( + double poisson_lpdf = stan::math::reduce_sum>( data, 0.0, 5, vlambda_d, idata); double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d); @@ -148,7 +148,7 @@ TEST(parallel_v3_reduce_sum, nesting_value) { ; } -TEST(parallel_v3_reduce_sum, nesting_gradient) { +TEST(v3_reduce_sum, nesting_gradient) { stan::math::init_threadpool_tbb(); double lambda_d = 10.0; @@ -167,7 +167,7 @@ TEST(parallel_v3_reduce_sum, nesting_gradient) { std::vector idata; std::vector vlambda_v(1, lambda_v); - var poisson_lpdf = stan::math::parallel_reduce_sum>( + var poisson_lpdf = stan::math::reduce_sum>( data, var(0.0), 5, vlambda_v, idata); var lambda_ref = lambda_d; @@ -220,7 +220,7 @@ struct grouped_count_lpdf { } }; -TEST(parallel_v3_reduce_sum, grouped_gradient) { +TEST(v3_reduce_sum, grouped_gradient) { stan::math::init_threadpool_tbb(); double lambda_d = 10.0; @@ -246,7 +246,7 @@ TEST(parallel_v3_reduce_sum, grouped_gradient) { var lambda_v = vlambda_v[0]; - var poisson_lpdf = stan::math::parallel_reduce_sum>( + var poisson_lpdf = stan::math::reduce_sum>( data, var(0.0), 5, vlambda_v, gidx); std::vector vref_lambda_v; From 2a370e73b5a22cf3c9c8f5f2d815a32db80a456a Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Sun, 5 Jan 2020 15:28:38 +0100 Subject: [PATCH 017/189] add up to 4 arguments for reduce function --- stan/math/prim/meta/broadcast_array.hpp | 1 + stan/math/prim/scal/functor/reduce_sum.hpp | 79 +++++------- stan/math/rev/scal/functor/reduce_sum.hpp | 114 +++++++++++++----- .../functor/parallel_v3_reduce_sum_test.cpp | 93 +++++++++++++- 4 files changed, 207 insertions(+), 80 deletions(-) diff --git a/stan/math/prim/meta/broadcast_array.hpp b/stan/math/prim/meta/broadcast_array.hpp index 3cc07ba97a1..78a8ba06cfb 100644 --- a/stan/math/prim/meta/broadcast_array.hpp +++ b/stan/math/prim/meta/broadcast_array.hpp @@ -37,6 +37,7 @@ class empty_broadcast_array { * Not implemented so cannot be called. */ T& operator[](int /*i*/); + T& operator[](int /*i*/) const; /** \ingroup type_trait * Not implemented so cannot be called. 
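[Editor's note, not part of the patch] To make the new calling convention concrete: after this commit the reducer functor receives the sub-slice plus up to four shared argument std::vectors, and reduce_sum forwards those vectors after the grainsize. The sketch below is illustrative only; scaled_count_lpdf, the weight argument and the data are invented, the include is the umbrella Stan Math header rather than the exact headers touched in this series, and the snippet mirrors the unit tests rather than being guaranteed to compile against this intermediate commit.

#include <stan/math.hpp>
#include <vector>

// Illustrative reducer: a weighted Poisson log-density over the
// sub-slice [start, end]; it must be default constructible, since
// reduce_sum only receives the functor as a type.
template <typename T>
struct scaled_count_lpdf {
  scaled_count_lpdf() {}

  inline T operator()(std::size_t start, std::size_t end,
                      const std::vector<int>& sub_slice,
                      const std::vector<T>& lambda,
                      const std::vector<double>& weight,
                      const std::vector<int>& idata1,
                      const std::vector<int>& idata2) const {
    return weight[0] * stan::math::poisson_lpmf(sub_slice, lambda[0]);
  }
};

// Caller side (mirrors the unit tests): initialize the TBB thread pool,
// then pass the sliced data, the initial value, the grainsize, and the
// four shared argument vectors.
void example() {
  using stan::math::var;
  stan::math::init_threadpool_tbb();

  std::vector<int> data(1000, 5);          // sliced argument
  std::vector<var> vlambda(1, var(10.0));  // shared parameter
  std::vector<double> weight(1, 2.0);      // shared real data (invented)
  std::vector<int> idata1(1, 0);           // shared int data (unused here)
  std::vector<int> idata2(1, 0);

  var lp = stan::math::reduce_sum<scaled_count_lpdf<var>>(
      data, var(0.0), 5, vlambda, weight, idata1, idata2);

  lp.grad();
  double dlambda = vlambda[0].adj();
  stan::math::recover_memory();
  (void)dlambda;
}

The same functor shape, with unused trailing arguments dropped, is what the later "make arguments optional" commit in this series accommodates.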
diff --git a/stan/math/prim/scal/functor/reduce_sum.hpp b/stan/math/prim/scal/functor/reduce_sum.hpp index 333e5ef6170..a1563ce70f7 100644 --- a/stan/math/prim/scal/functor/reduce_sum.hpp +++ b/stan/math/prim/scal/functor/reduce_sum.hpp @@ -21,32 +21,42 @@ namespace internal { // base definition => compile error template + class Arg3, class Arg4, class T_return_type> struct reduce_sum_impl {}; -template -struct reduce_sum_impl { +template +struct reduce_sum_impl { using vmapped_t = std::vector; using arg1_t = std::vector; using arg2_t = std::vector; + using arg3_t = std::vector; + using arg4_t = std::vector; struct recursive_reducer { const vmapped_t& vmapped_; const arg1_t& arg1_; const arg2_t& arg2_; + const arg3_t& arg3_; + const arg4_t& arg4_; T terms_sum_; recursive_reducer(const vmapped_t& vmapped, const T& init, - const arg1_t& arg1, const arg2_t& arg2) + const arg1_t& arg1, const arg2_t& arg2, + const arg3_t& arg3, const arg4_t& arg4, ) : vmapped_(vmapped), arg1_(arg1), arg2_(arg2), + arg3_(arg3), + arg4_(arg4), terms_sum_(value_of(init)) {} recursive_reducer(recursive_reducer& other, tbb::split) : vmapped_(other.vmapped_), arg1_(other.arg1_), arg2_(other.arg2_), + arg3_(other.arg3_), + arg4_(other.arg4_), terms_sum_(0.0) {} void operator()(const tbb::blocked_range& r) { @@ -58,10 +68,10 @@ struct reduce_sum_impl { auto end = vmapped_.begin(); std::advance(end, r.end()); - vmapped_t sub_slice(start, end); + const vmapped_t sub_slice(start, end); - terms_sum_ - += ReduceFunction()(r.begin(), r.end() - 1, sub_slice, arg1_, arg2_); + terms_sum_ += ReduceFunction()(r.begin(), r.end() - 1, sub_slice, arg1_, + arg2_, arg3_, arg4_); } void join(const recursive_reducer& child) { @@ -70,9 +80,10 @@ struct reduce_sum_impl { }; T operator()(const vmapped_t& vmapped, T init, std::size_t grainsize, - const arg1_t& arg1, const arg2_t& arg2) const { + const arg1_t& arg1, const arg2_t& arg2, const arg3_t& arg3, + const arg4_t& arg4) const { const std::size_t num_jobs = vmapped.size(); - recursive_reducer worker(vmapped, init, arg1, arg2); + recursive_reducer worker(vmapped, init, arg1, arg2, arg3, arg4); tbb::parallel_reduce( tbb::blocked_range(0, num_jobs, grainsize), worker); return std::move(worker.terms_sum_); @@ -81,54 +92,22 @@ struct reduce_sum_impl { } // namespace internal /* - * The ReduceFunction is expected to have an operator with the - * signature - * - * inline T operator()(std::size_t start, std::size_t end, - * std::vector& sub_slice, std::vector& lambda, - * const std::vector& gidx) const - * - * So the input data is sliced into smaller pieces and given into the - * reducer. Which exact sub-slice is given to the function is defined - * by start and end index. These can be used to map data to groups, - * for example. It would be good to extend this signature in two ways: - * - * 1. The large std::vector which is sliced into smaller pieces should - * be allowed to be any data (so int, double, vectors, matrices, - * ...). In a second step this should also be extended to vars - * which will require more considerations, but is beneficial. One - * could leave this generalization to a future version. - * - * 2. The arguments past sub_slice should be variable. So the - * arguments past sub_slice would need to be represented by the - * "..." thing in C++. The complication for this is managing all - * the calls. - * - * So the reduce_sum signature should look like - * - * template - * constexpr T reduce_sum(InputIt first, InputIt last, T init, - * std::size_t grainsize, ...) 
- * - * which corresponds to a ReduceFunction which looks like - * - * inline T operator()(std::size_t start, std::size_t end, - * std::vector& sub_slice, ...) - * * Note that the ReduceFunction is only passed in as type to prohibit * that any internal state of the functor is causing trouble. Thus, * the functor must be default constructible without any arguments. * */ -template +template constexpr T reduce_sum(const std::vector& vmapped, T init, - std::size_t grainsize, - const std::vector& arg1, - const std::vector& arg2) { + std::size_t grainsize, const std::vector& arg1, + const std::vector& arg2, + const std::vector& arg3, + const std::vector& arg4) { typedef T return_base_t; - return internal::reduce_sum_impl()( - vmapped, init, grainsize, arg1, arg2); + return internal::reduce_sum_impl()(vmapped, init, grainsize, + arg1, arg2, arg3, arg4); } } // namespace math diff --git a/stan/math/rev/scal/functor/reduce_sum.hpp b/stan/math/rev/scal/functor/reduce_sum.hpp index df59c1d458b..f0d26989a69 100644 --- a/stan/math/rev/scal/functor/reduce_sum.hpp +++ b/stan/math/rev/scal/functor/reduce_sum.hpp @@ -22,10 +22,28 @@ struct child_type { using type = int; }; +template +typename child_type::type adjoint_of(const T x) { + return typename child_type::type(0); +} + +template +std::vector::type> adjoint_of(const std::vector& x) { + const size_t size = x.size(); + std::vector::type> result(size); + for (size_t i = 0; i != size; ++i) { + result[i] = adjoint_of(x[i]); + } + return result; +} + +double adjoint_of(const var& x) { return x.vi_->adj_; } + namespace internal { -template -struct reduce_sum_impl { +template +struct reduce_sum_impl { using vmapped_t = std::vector; using arg1_t = std::vector; using arg2_t = std::vector; @@ -33,9 +51,12 @@ struct reduce_sum_impl { using vmapped_value_t = std::vector::type>; using arg1_value_t = std::vector::type>; using arg2_value_t = std::vector::type>; + using arg3_value_t = std::vector::type>; + using arg4_value_t = std::vector::type>; using ops_partials_t - = operands_and_partials, std::vector>; + = operands_and_partials, std::vector, + std::vector, std::vector>; struct recursive_reducer { const vmapped_t& vmapped_; @@ -43,6 +64,10 @@ struct reduce_sum_impl { const arg1_value_t& arg1_value_; const arg2_t& arg2_; const arg2_value_t& arg2_value_; + const arg3_t& arg3_; + const arg3_value_t& arg3_value_; + const arg4_t& arg4_; + const arg4_value_t& arg4_value_; ops_partials_t& terms_partials_mapped_; ops_partials_t terms_partials_args_; @@ -51,14 +76,20 @@ struct reduce_sum_impl { recursive_reducer(const vmapped_t& vmapped, const T& init, const arg1_t& arg1, const arg1_value_t& arg1_value, const arg2_t& arg2, const arg2_value_t& arg2_value, + const arg3_t& arg3, const arg3_value_t& arg3_value, + const arg4_t& arg4, const arg4_value_t& arg4_value, ops_partials_t& terms_partials_mapped) : vmapped_(vmapped), arg1_(arg1), arg1_value_(arg1_value), arg2_(arg2), arg2_value_(arg2_value), + arg3_(arg3), + arg3_value_(arg3_value), + arg4_(arg4), + arg4_value_(arg4_value), terms_partials_mapped_(terms_partials_mapped), - terms_partials_args_(vmapped, arg1, arg2), + terms_partials_args_(vmapped, arg1, arg2, arg3, arg4), terms_sum_(value_of(init)) {} recursive_reducer(recursive_reducer& other, tbb::split) @@ -67,8 +98,12 @@ struct reduce_sum_impl { arg1_value_(other.arg1_value_), arg2_(other.arg2_), arg2_value_(other.arg2_value_), + arg3_(other.arg3_), + arg3_value_(other.arg3_value_), + arg4_(other.arg4_), + arg4_value_(other.arg4_value_), 
terms_partials_mapped_(other.terms_partials_mapped_), - terms_partials_args_(vmapped_, arg1_, arg2_), + terms_partials_args_(vmapped_, arg1_, arg2_, arg3_, arg4_), terms_sum_(0.0) {} void operator()(const tbb::blocked_range& r) { @@ -107,37 +142,45 @@ struct reduce_sum_impl { // const std::vector varg1_d = value_of(varg1_); // std::vector varg1(varg1_d.begin(), varg1_d.end()); - arg1_t local_arg1(arg1_value_.begin(), arg1_value_.end()); - arg2_t local_arg2(arg2_value_.begin(), arg2_value_.end()); + const arg1_t local_arg1(arg1_value_.begin(), arg1_value_.end()); + const arg2_t local_arg2(arg2_value_.begin(), arg2_value_.end()); + const arg3_t local_arg3(arg3_value_.begin(), arg3_value_.end()); + const arg4_t local_arg4(arg4_value_.begin(), arg4_value_.end()); - T sub_sum_v = ReduceFunction()(r.begin(), r.end() - 1, local_sub_slice, - local_arg1, local_arg2); + T sub_sum_v + = ReduceFunction()(r.begin(), r.end() - 1, local_sub_slice, + local_arg1, local_arg2, local_arg3, local_arg4); sub_sum_v.grad(); terms_sum_ += sub_sum_v.val(); - /* if (!is_constant_all::value) { + vmapped_value_t sub_slice_adjoint = adjoint_of(local_sub_slice); for (std::size_t i = 0; i != r.size(); ++i) terms_partials_mapped_.edge1_.partials_[r.begin() + i] - += local_sub_slice[i].adj(); + += sub_slice_adjoint[i]; } - */ - - /**/ if (!is_constant_all::value) { + arg1_value_t arg1_adjoint = adjoint_of(local_arg1); for (std::size_t i = 0; i != local_arg1.size(); ++i) - terms_partials_args_.edge2_.partials_[i] += local_arg1[i].adj(); + terms_partials_args_.edge2_.partials_[i] += arg1_adjoint[i]; } - /**/ - - /* if (!is_constant_all::value) { + arg2_value_t arg2_adjoint = adjoint_of(local_arg2); for (std::size_t i = 0; i != local_arg2.size(); ++i) - terms_partials_args_.edge3_.partials_[i] += local_arg2[i].adj(); + terms_partials_args_.edge3_.partials_[i] += arg2_adjoint[i]; + } + if (!is_constant_all::value) { + arg3_value_t arg3_adjoint = adjoint_of(local_arg1); + for (std::size_t i = 0; i != local_arg3.size(); ++i) + terms_partials_args_.edge4_.partials_[i] += arg3_adjoint[i]; + } + if (!is_constant_all::value) { + arg4_value_t arg4_adjoint = adjoint_of(local_arg4); + for (std::size_t i = 0; i != local_arg2.size(); ++i) + terms_partials_args_.edge5_.partials_[i] += arg2_adjoint[i]; } - */ } catch (const std::exception& e) { recover_memory_nested(); @@ -149,34 +192,42 @@ struct reduce_sum_impl { void join(const recursive_reducer& child) { terms_sum_ += child.terms_sum_; - /**/ if (!is_constant_all::value) { for (std::size_t i = 0; i != arg1_.size(); ++i) terms_partials_args_.edge2_.partials_[i] += child.terms_partials_args_.edge2_.partials_[i]; } - /**/ - - /* if (!is_constant_all::value) { for (std::size_t i = 0; i != arg2_.size(); ++i) terms_partials_args_.edge3_.partials_[i] += child.terms_partials_args_.edge3_.partials_[i]; } - */ + if (!is_constant_all::value) { + for (std::size_t i = 0; i != arg3_.size(); ++i) + terms_partials_args_.edge4_.partials_[i] + += child.terms_partials_args_.edge4_.partials_[i]; + } + if (!is_constant_all::value) { + for (std::size_t i = 0; i != arg4_.size(); ++i) + terms_partials_args_.edge5_.partials_[i] + += child.terms_partials_args_.edge5_.partials_[i]; + } } }; T operator()(const vmapped_t& vmapped, T init, std::size_t grainsize, - const arg1_t& arg1, const arg2_t& arg2) const { + const arg1_t& arg1, const arg2_t& arg2, const arg3_t& arg3, + const arg4_t& arg4) const { const std::size_t num_jobs = vmapped.size(); arg1_value_t arg1_value = value_of(arg1); arg2_value_t arg2_value = 
value_of(arg2); + arg3_value_t arg3_value = value_of(arg3); + arg4_value_t arg4_value = value_of(arg4); - ops_partials_t ops(vmapped, arg1, arg2); + ops_partials_t ops(vmapped, arg1, arg2, arg3, arg4); recursive_reducer worker(vmapped, init, arg1, arg1_value, arg2, arg2_value, - ops); + arg3, arg3_value, arg4, arg4_value, ops); tbb::parallel_reduce( tbb::blocked_range(0, num_jobs, grainsize), worker); @@ -184,10 +235,15 @@ struct reduce_sum_impl { if (!is_constant_all::value) { ops.edge2_.partials_ = worker.terms_partials_args_.edge2_.partials_; } - if (!is_constant_all::value) { ops.edge3_.partials_ = worker.terms_partials_args_.edge3_.partials_; } + if (!is_constant_all::value) { + ops.edge4_.partials_ = worker.terms_partials_args_.edge4_.partials_; + } + if (!is_constant_all::value) { + ops.edge5_.partials_ = worker.terms_partials_args_.edge5_.partials_; + } return ops.build(worker.terms_sum_); } diff --git a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp index d879aa04d21..f12c87651b9 100644 --- a/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp +++ b/test/unit/math/rev/scal/functor/parallel_v3_reduce_sum_test.cpp @@ -208,7 +208,8 @@ struct grouped_count_lpdf { // does the reduction in the sub-slice start to end inline T operator()(std::size_t start, std::size_t end, - std::vector& sub_slice, std::vector& lambda, + const std::vector& sub_slice, + const std::vector& lambda, const std::vector& gidx) const { const std::size_t num_terms = end - start + 1; // std::cout << "sub-slice " << start << " - " << end << "; num_terms = " << @@ -280,3 +281,93 @@ TEST(v3_reduce_sum, grouped_gradient) { ; stan::math::recover_memory(); } + +// ******************************** +// slice over the grouping variable which is a var +// ******************************** + +template +struct slice_group_count_lpdf { + slice_group_count_lpdf() {} + + // does the reduction in the sub-slice start to end + inline T operator()(std::size_t start, std::size_t end, + const std::vector& lambda_slice, + const std::vector& y, + const std::vector& gsidx) const { + const std::size_t num_groups = end - start + 1; + T result = 0.0; + for (std::size_t i = 0; i != num_groups; ++i) { + std::vector y_group(y.begin() + gsidx[start + i], + y.begin() + gsidx[start + i + 1]); + result += stan::math::poisson_lpmf(y_group, lambda_slice[i]); + } + return result; + } +}; + +TEST(v3_reduce_sum, slice_group_gradient) { + stan::math::init_threadpool_tbb(); + + double lambda_d = 10.0; + const std::size_t groups = 10; + const std::size_t elems_per_group = 1000; + const std::size_t elems = groups * elems_per_group; + const std::size_t num_iter = 1000; + + std::vector data(elems); + std::vector gidx(elems); + std::vector gsidx(groups + 1); + + for (std::size_t i = 0, k = 0; i != groups; ++i) { + gsidx[i] = k; + for (std::size_t j = 0; j != elems_per_group; ++j, ++k) { + data[k] = k; + gidx[k] = i; + } + gsidx[i + 1] = k; + } + + using stan::math::var; + + std::vector vlambda_v; + + for (std::size_t i = 0; i != groups; ++i) + vlambda_v.push_back(i + 0.2); + + var lambda_v = vlambda_v[0]; + + var poisson_lpdf = stan::math::reduce_sum>( + vlambda_v, var(0.0), 5, data, gsidx); + + std::vector vref_lambda_v; + for (std::size_t i = 0; i != elems; ++i) { + vref_lambda_v.push_back(vlambda_v[gidx[i]]); + } + var lambda_ref = vlambda_v[0]; + + var poisson_lpdf_ref = stan::math::poisson_lpmf(data, vref_lambda_v); + + EXPECT_FLOAT_EQ(value_of(poisson_lpdf), 
value_of(poisson_lpdf_ref)); + + stan::math::grad(poisson_lpdf_ref.vi_); + const double lambda_ref_adj = lambda_ref.adj(); + + stan::math::set_zero_all_adjoints(); + stan::math::grad(poisson_lpdf.vi_); + const double lambda_adj = lambda_v.adj(); + + EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj); + + std::cout << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() + << std::endl; + ; + std::cout << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl; + ; + + std::cout << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl; + ; + std::cout << "gradient wrt to lambda: " << lambda_adj << std::endl; + ; + stan::math::recover_memory(); +} From bd1390701ea91858cb5177baf1101d139c6bdd72 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Sun, 5 Jan 2020 15:52:30 +0100 Subject: [PATCH 018/189] make arguments optional --- stan/math/prim/scal/functor/reduce_sum.hpp | 124 ++++++++++++++++++++- stan/math/rev/scal/functor/reduce_sum.hpp | 6 +- 2 files changed, 124 insertions(+), 6 deletions(-) diff --git a/stan/math/prim/scal/functor/reduce_sum.hpp b/stan/math/prim/scal/functor/reduce_sum.hpp index a1563ce70f7..6aa48d1d704 100644 --- a/stan/math/prim/scal/functor/reduce_sum.hpp +++ b/stan/math/prim/scal/functor/reduce_sum.hpp @@ -43,7 +43,7 @@ struct reduce_sum_impl { recursive_reducer(const vmapped_t& vmapped, const T& init, const arg1_t& arg1, const arg2_t& arg2, - const arg3_t& arg3, const arg4_t& arg4, ) + const arg3_t& arg3, const arg4_t& arg4) : vmapped_(vmapped), arg1_(arg1), arg2_(arg2), @@ -89,6 +89,76 @@ struct reduce_sum_impl { return std::move(worker.terms_sum_); } }; + +// default gives a compile time error +template +struct reduce_function_arg_adapter {}; + +template +struct reduce_function_arg_adapter { + template + inline auto operator()(std::size_t start, std::size_t end, + const std::vector& vmapped_slice, + const std::vector& arg1, + const std::vector& arg2, + const std::vector& arg3, + const std::vector& arg4) { + return ReduceFunction()(start, end, vmapped_slice, arg1, arg2, arg3, arg4); + } +}; + +template +struct reduce_function_arg_adapter { + template + inline auto operator()(std::size_t start, std::size_t end, + const std::vector& vmapped_slice, + const std::vector& arg1, + const std::vector& arg2, + const std::vector& arg3, + const std::vector& arg4) { + return ReduceFunction()(start, end, vmapped_slice, arg1, arg2, arg3); + } +}; + +template +struct reduce_function_arg_adapter { + template + inline auto operator()(std::size_t start, std::size_t end, + const std::vector& vmapped_slice, + const std::vector& arg1, + const std::vector& arg2, + const std::vector& arg3, + const std::vector& arg4) { + return ReduceFunction()(start, end, vmapped_slice, arg1, arg2); + } +}; + +template +struct reduce_function_arg_adapter { + template + inline auto operator()(std::size_t start, std::size_t end, + const std::vector& vmapped_slice, + const std::vector& arg1, + const std::vector& arg2, + const std::vector& arg3, + const std::vector& arg4) { + return ReduceFunction()(start, end, vmapped_slice, arg1); + } +}; + +template +struct reduce_function_arg_adapter { + template + inline auto operator()(std::size_t start, std::size_t end, + const std::vector& vmapped_slice, + const std::vector& arg1, + const std::vector& arg2, + const std::vector& arg3, + const std::vector& arg4) { + return ReduceFunction()(start, end, vmapped_slice); + } +}; + } // namespace internal /* @@ -105,9 +175,55 @@ constexpr T reduce_sum(const std::vector& vmapped, T init, const std::vector& arg3, 
const std::vector& arg4) { typedef T return_base_t; - return internal::reduce_sum_impl()(vmapped, init, grainsize, - arg1, arg2, arg3, arg4); + return internal::reduce_sum_impl< + internal::reduce_function_arg_adapter, M, T, Arg1, + Arg2, Arg3, Arg4, return_base_t>()(vmapped, init, grainsize, arg1, arg2, + arg3, arg4); +} + +template +constexpr T reduce_sum(const std::vector& vmapped, T init, + std::size_t grainsize, const std::vector& arg1, + const std::vector& arg2, + const std::vector& arg3) { + typedef T return_base_t; + return internal::reduce_sum_impl< + internal::reduce_function_arg_adapter, M, T, Arg1, + Arg2, Arg3, int, return_base_t>()(vmapped, init, grainsize, arg1, arg2, + arg3, std::vector()); +} + +template +constexpr T reduce_sum(const std::vector& vmapped, T init, + std::size_t grainsize, const std::vector& arg1, + const std::vector& arg2) { + typedef T return_base_t; + return internal::reduce_sum_impl< + internal::reduce_function_arg_adapter, M, T, Arg1, + Arg2, int, int, return_base_t>()(vmapped, init, grainsize, arg1, arg2, + std::vector(), std::vector()); +} +template +constexpr T reduce_sum(const std::vector& vmapped, T init, + std::size_t grainsize, const std::vector& arg1) { + typedef T return_base_t; + return internal::reduce_sum_impl< + internal::reduce_function_arg_adapter, M, T, Arg1, int, + int, int, return_base_t>()(vmapped, init, grainsize, arg1, + std::vector(), std::vector(), + std::vector()); +} + +template +constexpr T reduce_sum(const std::vector& vmapped, T init, + std::size_t grainsize) { + typedef T return_base_t; + return internal::reduce_sum_impl< + internal::reduce_function_arg_adapter, M, T, int, int, + int, int, return_base_t>()(vmapped, init, grainsize, std::vector(), + std::vector(), std::vector(), + std::vector()); } } // namespace math diff --git a/stan/math/rev/scal/functor/reduce_sum.hpp b/stan/math/rev/scal/functor/reduce_sum.hpp index f0d26989a69..943e17996bb 100644 --- a/stan/math/rev/scal/functor/reduce_sum.hpp +++ b/stan/math/rev/scal/functor/reduce_sum.hpp @@ -47,6 +47,8 @@ struct reduce_sum_impl { using vmapped_t = std::vector; using arg1_t = std::vector; using arg2_t = std::vector; + using arg3_t = std::vector; + using arg4_t = std::vector; using vmapped_value_t = std::vector::type>; using arg1_value_t = std::vector::type>; @@ -172,14 +174,14 @@ struct reduce_sum_impl { terms_partials_args_.edge3_.partials_[i] += arg2_adjoint[i]; } if (!is_constant_all::value) { - arg3_value_t arg3_adjoint = adjoint_of(local_arg1); + arg3_value_t arg3_adjoint = adjoint_of(local_arg3); for (std::size_t i = 0; i != local_arg3.size(); ++i) terms_partials_args_.edge4_.partials_[i] += arg3_adjoint[i]; } if (!is_constant_all::value) { arg4_value_t arg4_adjoint = adjoint_of(local_arg4); for (std::size_t i = 0; i != local_arg2.size(); ++i) - terms_partials_args_.edge5_.partials_[i] += arg2_adjoint[i]; + terms_partials_args_.edge5_.partials_[i] += arg4_adjoint[i]; } } catch (const std::exception& e) { From e1deb4fd1439c2d05de7fd265e506702aa77e441 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Sun, 5 Jan 2020 16:01:07 +0100 Subject: [PATCH 019/189] more doc and const declares --- stan/math/prim/scal/functor/reduce_sum.hpp | 19 +++++++++++++------ stan/math/rev/scal/functor/reduce_sum.hpp | 8 ++++---- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/stan/math/prim/scal/functor/reduce_sum.hpp b/stan/math/prim/scal/functor/reduce_sum.hpp index 6aa48d1d704..e1ac9e6c625 100644 --- a/stan/math/prim/scal/functor/reduce_sum.hpp +++ 
b/stan/math/prim/scal/functor/reduce_sum.hpp @@ -90,6 +90,14 @@ struct reduce_sum_impl { } }; +/* The reduce_function_arg_adapter adapts the user functor to + * conventions used in the reduce_sum functions. While the user + * functor is allowed to specify optionally up to 4 additional + * arguments, the reduce_sum functions always expect 4 + * arguments. Thus, the call of the reduce_sum functions with 4 + * arguments get adapted to the user functors by dropping the + * arguments which are not used. + */ // default gives a compile time error template struct reduce_function_arg_adapter {}; @@ -102,7 +110,7 @@ struct reduce_function_arg_adapter { const std::vector& arg1, const std::vector& arg2, const std::vector& arg3, - const std::vector& arg4) { + const std::vector& arg4) const { return ReduceFunction()(start, end, vmapped_slice, arg1, arg2, arg3, arg4); } }; @@ -115,7 +123,7 @@ struct reduce_function_arg_adapter { const std::vector& arg1, const std::vector& arg2, const std::vector& arg3, - const std::vector& arg4) { + const std::vector& arg4) const { return ReduceFunction()(start, end, vmapped_slice, arg1, arg2, arg3); } }; @@ -128,7 +136,7 @@ struct reduce_function_arg_adapter { const std::vector& arg1, const std::vector& arg2, const std::vector& arg3, - const std::vector& arg4) { + const std::vector& arg4) const { return ReduceFunction()(start, end, vmapped_slice, arg1, arg2); } }; @@ -141,7 +149,7 @@ struct reduce_function_arg_adapter { const std::vector& arg1, const std::vector& arg2, const std::vector& arg3, - const std::vector& arg4) { + const std::vector& arg4) const { return ReduceFunction()(start, end, vmapped_slice, arg1); } }; @@ -154,7 +162,7 @@ struct reduce_function_arg_adapter { const std::vector& arg1, const std::vector& arg2, const std::vector& arg3, - const std::vector& arg4) { + const std::vector& arg4) const { return ReduceFunction()(start, end, vmapped_slice); } }; @@ -165,7 +173,6 @@ struct reduce_function_arg_adapter { * Note that the ReduceFunction is only passed in as type to prohibit * that any internal state of the functor is causing trouble. Thus, * the functor must be default constructible without any arguments. 
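[Editor's note, not part of the patch] Read together with the adapter specializations added in the previous commit, the mechanism can be illustrated as follows. The second template parameter of reduce_function_arg_adapter is elided in this patch text but is presumably the number of user-supplied arguments; the functor one_arg_lpdf below is invented for illustration.

// Sketch only: a reducer taking a single shared argument vector.
template <typename T>
struct one_arg_lpdf {
  one_arg_lpdf() {}
  inline T operator()(std::size_t start, std::size_t end,
                      const std::vector<int>& sub_slice,
                      const std::vector<T>& lambda) const {
    return stan::math::poisson_lpmf(sub_slice, lambda[0]);
  }
};

// The one-argument reduce_sum overload pads the three unused slots with
// empty std::vector<int>() instances and dispatches through the adapter,
// whose matching specialization drops arg2..arg4 and forwards only the
// first argument to the user functor:
//
//   return ReduceFunction()(start, end, vmapped_slice, arg1);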
- * */ template diff --git a/stan/math/rev/scal/functor/reduce_sum.hpp b/stan/math/rev/scal/functor/reduce_sum.hpp index 943e17996bb..d90ad18769a 100644 --- a/stan/math/rev/scal/functor/reduce_sum.hpp +++ b/stan/math/rev/scal/functor/reduce_sum.hpp @@ -221,10 +221,10 @@ struct reduce_sum_impl { const arg1_t& arg1, const arg2_t& arg2, const arg3_t& arg3, const arg4_t& arg4) const { const std::size_t num_jobs = vmapped.size(); - arg1_value_t arg1_value = value_of(arg1); - arg2_value_t arg2_value = value_of(arg2); - arg3_value_t arg3_value = value_of(arg3); - arg4_value_t arg4_value = value_of(arg4); + const arg1_value_t arg1_value = value_of(arg1); + const arg2_value_t arg2_value = value_of(arg2); + const arg3_value_t arg3_value = value_of(arg3); + const arg4_value_t arg4_value = value_of(arg4); ops_partials_t ops(vmapped, arg1, arg2, arg3, arg4); From 358525cfe3fc65188dc8ff1c9def382794f52f8b Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Sun, 5 Jan 2020 16:07:42 +0100 Subject: [PATCH 020/189] doc --- stan/math/rev/scal/functor/reduce_sum.hpp | 26 +++-------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/stan/math/rev/scal/functor/reduce_sum.hpp b/stan/math/rev/scal/functor/reduce_sum.hpp index d90ad18769a..801938bd5f8 100644 --- a/stan/math/rev/scal/functor/reduce_sum.hpp +++ b/stan/math/rev/scal/functor/reduce_sum.hpp @@ -115,35 +115,15 @@ struct reduce_sum_impl { try { start_nested(); - // elem_t should better be a non-var!!! Otherwise we interfere - // with the outer AD tree as we mess with the adjoints (and - // right now we discard that it is a var if it would be) + // create a deep copy of all var's so that these are not + // linked to any outer AD tree + vmapped_t local_sub_slice; local_sub_slice.reserve(r.size()); for (std::size_t i = r.begin(); i != r.end(); ++i) local_sub_slice.emplace_back(value_of(vmapped_[i])); - // create a deep copy of arg1 which is not tied to the outer - // AD tree - // todo: these copies can be put onto a thread-local storage - // object such that we only create these once per thread which - // should significantly boost performance as many copies are - // avoided... 
this comes at the cost that the adjoints have to - // be zeroed "manually" - /* using the var_nochain_stack_ tape avoids up-propagation of chain - std::vector varg1; - varg1.reserve(varg1_.size()); - - for (Arg1& elem : varg1_) - varg1.emplace_back(var(new vari(elem.val(), false))); - */ - - // but this should actually do the same if we simply - // instantiate the var from a double representation - // const std::vector varg1_d = value_of(varg1_); - // std::vector varg1(varg1_d.begin(), varg1_d.end()); - const arg1_t local_arg1(arg1_value_.begin(), arg1_value_.end()); const arg2_t local_arg2(arg2_value_.begin(), arg2_value_.end()); const arg3_t local_arg3(arg3_value_.begin(), arg3_value_.end()); From 5f76ca23ba48b5487bc87f6a4379bb8b4926eea8 Mon Sep 17 00:00:00 2001 From: Sebastian Weber Date: Mon, 6 Jan 2020 22:15:24 +0100 Subject: [PATCH 021/189] generalize possible input data structures --- stan/math/prim/meta/broadcast_array.hpp | 8 + stan/math/prim/meta/operands_and_partials.hpp | 1 + stan/math/rev/scal/functor/reduce_sum.hpp | 263 +++++++++++++++--- 3 files changed, 227 insertions(+), 45 deletions(-) diff --git a/stan/math/prim/meta/broadcast_array.hpp b/stan/math/prim/meta/broadcast_array.hpp index 78a8ba06cfb..d71a2159380 100644 --- a/stan/math/prim/meta/broadcast_array.hpp +++ b/stan/math/prim/meta/broadcast_array.hpp @@ -16,6 +16,7 @@ class broadcast_array { explicit broadcast_array(T& prim) : prim_(prim) {} T& operator[](int /*i*/) { return prim_; } + T& operator[](int /*i*/) const { return prim_; } /** \ingroup type_trait * We can assign any right hand side which allows for indexing to a @@ -27,6 +28,10 @@ class broadcast_array { void operator=(const Y& m) { prim_ = m[0]; } + broadcast_array& operator=(broadcast_array other) { + prim_ = other.prim_; + return *this; + } }; template @@ -44,6 +49,9 @@ class empty_broadcast_array { */ template void operator=(const Y& /*A*/); + + template + void operator+=(R); }; template diff --git a/stan/math/prim/meta/operands_and_partials.hpp b/stan/math/prim/meta/operands_and_partials.hpp index 5b312179430..470c218259f 100644 --- a/stan/math/prim/meta/operands_and_partials.hpp +++ b/stan/math/prim/meta/operands_and_partials.hpp @@ -38,6 +38,7 @@ template class ops_partials_edge { public: empty_broadcast_array partials_; + empty_broadcast_array partials_vec_; ops_partials_edge() {} explicit ops_partials_edge(const Op& /* op */) {} diff --git a/stan/math/rev/scal/functor/reduce_sum.hpp b/stan/math/rev/scal/functor/reduce_sum.hpp index 801938bd5f8..aa589df4e34 100644 --- a/stan/math/rev/scal/functor/reduce_sum.hpp +++ b/stan/math/rev/scal/functor/reduce_sum.hpp @@ -4,8 +4,6 @@ #include #include -#include - #include #include #include @@ -17,20 +15,42 @@ namespace stan { namespace math { +template +struct value_type { + using type = double; +}; + template <> -struct child_type { +struct value_type { using type = int; }; +template