Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into feature/vectorized…
Browse files Browse the repository at this point in the history
…-binary-funcs1-varmat
  • Loading branch information
SteveBronder committed Jun 22, 2021
2 parents 3fb1507 + 8eedfb8 commit d309731
Show file tree
Hide file tree
Showing 52 changed files with 1,149 additions and 82 deletions.
2 changes: 1 addition & 1 deletion make/tests
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ else
endif

%.hpp-test : %.hpp test/dummy.cpp
$(COMPILE.cpp) $(CXXFLAGS) -O0 -include $^ -o $(DEV_NULL)
$(COMPILE.cpp) $(CXXFLAGS) -O0 -include $^ -o $(DEV_NULL) -Wunused-local-typedefs

test/dummy.cpp:
@mkdir -p test
Expand Down
1 change: 0 additions & 1 deletion stan/math/fwd/fun/mdivide_left.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ template <typename T1, typename T2,
inline Eigen::Matrix<value_type_t<T2>, T1::RowsAtCompileTime,
T2::ColsAtCompileTime>
mdivide_left(const T1& A, const T2& b) {
using T = typename value_type_t<T2>::Scalar;
constexpr int S1 = T1::RowsAtCompileTime;
constexpr int C2 = T2::ColsAtCompileTime;

Expand Down
1 change: 0 additions & 1 deletion stan/math/fwd/fun/mdivide_left_tri_low.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ template <typename T1, typename T2, require_eigen_t<T1>* = nullptr,
inline Eigen::Matrix<value_type_t<T2>, T1::RowsAtCompileTime,
T2::ColsAtCompileTime>
mdivide_left_tri_low(const T1& A, const T2& b) {
using T = typename value_type_t<T2>::Scalar;
constexpr int S1 = T1::RowsAtCompileTime;
constexpr int C2 = T2::ColsAtCompileTime;

Expand Down
224 changes: 224 additions & 0 deletions stan/math/opencl/kernels/cumulative_sum.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
#ifndef STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
#define STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
#ifdef STAN_OPENCL

#include <stan/math/opencl/kernel_cl.hpp>
#include <stan/math/opencl/buffer_types.hpp>
#include <stan/math/opencl/matrix_cl_view.hpp>
#include <string>

namespace stan {
namespace math {
namespace opencl_kernels {

// \cond
static const char *cumulative_sum1_kernel_code = STRINGIFY(
// \endcond
/** \ingroup opencl_kernels
* First kernel of the cumulative sum implementation. Each thread sums the
* assigned elements and threads within same work group add their results
* together.
*
* @param[out] out_wgs results from each work group
* @param[out] out_threads results for each thread
* @param[in] in input data
* @param size size number of elements in the input
*/
__kernel void cumulative_sum1(__global SCAL *out_wgs,
__global SCAL *out_threads, __global SCAL *in,
int size) {
const int gid = get_global_id(0);
const int lid = get_local_id(0);
const int lsize = get_local_size(0);
const int wg_id = get_group_id(0);
const int gsize = get_global_size(0);

int start = (int)((long)gid * size / gsize); // NOLINT
int end = (int)((long)(gid + 1) * size / gsize); // NOLINT
__local SCAL local_storage[LOCAL_SIZE_];

SCAL acc = 0;
if (start != end) {
acc = in[start];
for (int i = start + 1; i < end; i++) {
acc += in[i];
}
}
for (int step = 1; step < lsize; step *= REDUCTION_STEP_SIZE) {
local_storage[lid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 1; i < REDUCTION_STEP_SIZE && step * i <= lid; i++) {
acc += local_storage[lid - step * i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
out_threads[gid] = acc;
if (lid == LOCAL_SIZE_ - 1) {
out_wgs[wg_id] = acc;
}
}
// \cond
);
// \endcond

// \cond
static const char *cumulative_sum2_kernel_code = STRINGIFY(
// \endcond
/** \ingroup opencl_kernels
* Second kernel of the cumulative sum implementation. Calculates prefix sum
* of given data in place using a single work group (must be run with a
* single work group).
*
* @param[in, out] data data to calculate cumulative sum of
* @param size size number of elements in the input
*/
__kernel void cumulative_sum2(__global SCAL *data, int size) {
const int gid = get_global_id(0);
const int gsize = get_global_size(0);

int start = (int)((long)gid * size / gsize); // NOLINT
int end = (int)((long)(gid + 1) * size / gsize); // NOLINT
__local SCAL local_storage[LOCAL_SIZE_];

SCAL acc;
if (start == end) {
acc = 0;
} else {
acc = data[start];
for (int i = start + 1; i < end; i++) {
acc += data[i];
}
}
local_storage[gid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < gsize; step *= REDUCTION_STEP_SIZE) {
for (int i = 1; i < REDUCTION_STEP_SIZE && step * i <= gid; i++) {
acc += local_storage[gid - step * i];
}
barrier(CLK_LOCAL_MEM_FENCE);
local_storage[gid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (start != end) {
if (gid == 0) {
acc = 0;
} else {
acc = local_storage[gid - 1];
}
for (int i = start; i < end; i++) {
acc += data[i];
data[i] = acc;
}
}
}
// \cond
);
// \endcond

// \cond
static const char *cumulative_sum3_kernel_code = STRINGIFY(
// \endcond
/** \ingroup opencl_kernels
* Third kernel of the cumulative sum implementation. Given sums of threads
* and cumulative sum of those calculates cumulative sum of given array.
* Must be run with the same number of threads and work groups as the first
* cumulative sum kernel.
*
* @param[out] out cumulatively summed input
* @param[out] in_data input data
* @param[in] in_threads summed results from each thread from the first
* kernel
* @param[in] in_wgs cumulatively summed results from each work group
* (calculated by previous two kernels)
* @param size size number of elements in the input
*/
__kernel void cumulative_sum3(__global SCAL *out, __global SCAL *in_data,
__global SCAL *in_threads,
__global SCAL *in_wgs, int size) {
const int gid = get_global_id(0);
const int lid = get_local_id(0);
const int lsize = get_local_size(0);
const int wg_id = get_group_id(0);
const int gsize = get_global_size(0);

int start = (int)((long)gid * size / gsize); // NOLINT
int end = (int)((long)(gid + 1) * size / gsize); // NOLINT
__local SCAL local_storage[LOCAL_SIZE_];

SCAL acc = 0;
if (wg_id != 0) {
acc = in_wgs[wg_id - 1];
}
if (lid != 0) {
acc += in_threads[gid - 1];
}
for (int i = start; i < end; i++) {
acc += in_data[i];
out[i] = acc;
}
}
// \cond
);
// \endcond

/**
* struct containing cumulative_sum kernels, grouped by scalar type.
*/
template <typename Scalar, typename = void>
struct cumulative_sum {};

template <typename T>
struct cumulative_sum<double, T> {
static const kernel_cl<out_buffer, out_buffer, in_buffer, int> kernel1;
static const kernel_cl<in_out_buffer, int> kernel2;
static const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
kernel3;
};
template <typename T>
struct cumulative_sum<int, T> {
static const kernel_cl<out_buffer, out_buffer, in_buffer, int> kernel1;
static const kernel_cl<in_out_buffer, int> kernel2;
static const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
kernel3;
};

template <typename T>
const kernel_cl<out_buffer, out_buffer, in_buffer, int>
cumulative_sum<double, T>::kernel1("cumulative_sum1",
{"#define SCAL double\n",
cumulative_sum1_kernel_code},
{{"REDUCTION_STEP_SIZE", 4},
{"LOCAL_SIZE_", 16}});
template <typename T>
const kernel_cl<out_buffer, out_buffer, in_buffer, int>
cumulative_sum<int, T>::kernel1(
"cumulative_sum1", {"#define SCAL int\n", cumulative_sum1_kernel_code},
{{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});

template <typename T>
const kernel_cl<in_out_buffer, int> cumulative_sum<double, T>::kernel2(
"cumulative_sum2", {"#define SCAL double\n", cumulative_sum2_kernel_code},
{{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});
template <typename T>
const kernel_cl<in_out_buffer, int> cumulative_sum<int, T>::kernel2(
"cumulative_sum2", {"#define SCAL int\n", cumulative_sum2_kernel_code},
{{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});

template <typename T>
const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
cumulative_sum<double, T>::kernel3("cumulative_sum3",
{"#define SCAL double\n",
cumulative_sum3_kernel_code},
{{"REDUCTION_STEP_SIZE", 4},
{"LOCAL_SIZE_", 16}});
template <typename T>
const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
cumulative_sum<int, T>::kernel3(
"cumulative_sum3", {"#define SCAL int\n", cumulative_sum3_kernel_code},
{{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});

} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
Loading

0 comments on commit d309731

Please sign in to comment.