-
-
Notifications
You must be signed in to change notification settings - Fork 183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add GPU Cholesky Primitive #1059
Changes from 14 commits
12e5d8e
ceee137
7528d50
8284c14
834ff4a
4c87e21
27beacc
87a67df
be110d2
287ae57
898ebca
4fdb965
36bea8b
c126ceb
6c0daff
2d00cdf
e8f9540
6b57d85
1a4a17b
36a6d4a
10c4f59
a781a96
88c26a0
cc187e6
0115c85
6bea215
b3ef66d
1c9bf89
763aca2
021d87d
0422c04
c120223
5bc8b71
774aef5
58666aa
da52c49
dc14da2
a736014
6649984
7d68799
f557e47
d71e4a6
d9675c5
775f7e4
0be4541
1e90855
d461d06
bb34aec
d7d09e7
d3bd266
1b65f2a
558f4f7
85eb0f3
907f1ec
dc24e9f
2f8be5b
89ebee8
7a146e4
fbb97a3
f594437
6c26574
d2285e0
8b04db9
6bc3eba
749e21c
1d8a707
89d1c21
9215ebc
a0b4597
dcc8c62
32dd8ca
f611fc7
15fbbe1
b704bce
08b19f0
a3f8881
96118c8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
#ifndef STAN_MATH_GPU_CHOLESKY_DECOMPOSE_HPP | ||
#define STAN_MATH_GPU_CHOLESKY_DECOMPOSE_HPP | ||
#ifdef STAN_OPENCL | ||
#include <stan/math/gpu/matrix_gpu.hpp> | ||
#include <stan/math/gpu/kernels/cholesky_decompose.hpp> | ||
#include <stan/math/gpu/multiply.hpp> | ||
#include <stan/math/gpu/multiply_transpose.hpp> | ||
#include <stan/math/gpu/lower_tri_inverse.hpp> | ||
#include <stan/math/gpu/transpose.hpp> | ||
#include <stan/math/gpu/subtract.hpp> | ||
#include <stan/math/gpu/err/check_diagonal_zeros.hpp> | ||
#include <stan/math/gpu/err/check_nan.hpp> | ||
#include <CL/cl.hpp> | ||
#include <algorithm> | ||
namespace stan { | ||
namespace math { | ||
/** | ||
* Return the lower-triangular Cholesky factor (i.e., matrix | ||
* square root) of the specified square, symmetric matrix. | ||
* The return value \f$L\f$ will be a lower-traingular matrix such that the | ||
* original matrix \f$A\f$ is given by | ||
* <p>\f$A = L \times L^T\f$. | ||
* The Cholesky decomposition is computed on the GPU. This algorithm is | ||
* recursive, where The parameters <code>block</code>, <code>divider</code>, and | ||
* <code>min_block</code> act as tuning parameters for the recursive step of | ||
* the GPU based Cholesky decompostion. The matrix is subset by the | ||
* <code>block</code> size, and if the <code>block</code> size is less than | ||
* <code>min_block</code> then the cholesky decomposition on the GPU is computed | ||
* using that submatrix. If <code>block</code> is greater than | ||
* <code>block_size</code> then <code>cholesky_decompose</code> is run again | ||
* with <code>block</code> equal to <code>block/divider</code>. Once the | ||
* Cholesky Decomposition is computed, the full matrix cholesky is created | ||
* by propogating the cholesky forward as given in the reference report below. | ||
* | ||
* For a full guide to how this works | ||
* see the Cholesy decompostion chapter in the reference report | ||
* <a href="https://goo.gl/6kWkJ5"> here</a>. | ||
* @param A Symmetric matrix on the GPU. | ||
* @param block Size of the block used to compute the cholesky decomposition. | ||
* @param divider Proportion to divide the submatrix by at each recursive step. | ||
* @param min_block The amount that block is checked against to decide | ||
* whether to continue the recursion or to perform the cholesky. | ||
* @return Square root of matrix on the GPU. | ||
* @throw std::domain_error if m is not | ||
* positive definite (if m has more than 0 elements) | ||
*/ | ||
inline matrix_gpu cholesky_decompose(matrix_gpu& A, const int block = 100, | ||
const int divider = 2, | ||
const int min_block = 100) { | ||
auto offset = 0; | ||
// NOTE: The code in this section follows the naming conventions | ||
// in the report linked in the docs. | ||
matrix_gpu A_11(block, block); | ||
matrix_gpu L_11(block, block); | ||
// Repeats the blocked cholesky decomposition until the size of the remaining | ||
// submatrix is smaller or equal to the block size | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this comment still accurate? It seems like this while loop is not actually breaking down A into smaller subproblems but rather iterating through all of the blocks of size There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the comment is accurate, but worded weirdly. I'll clean this up There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually reading that comment it's better to change the comment on line 108 so that it references the remaining submatrix. Then readers can get that the part here runs until the block size exceeds the size of the matrix then the part of the matrix remaining is computed below |
||
while ((offset + block) < (A.rows())) { | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
auto block_subset = A.rows() - offset - block; | ||
matrix_gpu A_21(block_subset, block); | ||
matrix_gpu A_22(block_subset, block_subset); | ||
// Copies the A_11 submatrix from the input | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
A_11.sub_block(A, offset, offset, 0, 0, block, block); | ||
// Calls the blocked cholesky for the submatrix A_11 | ||
// or calls the kernel directly if the size of the block is small enough | ||
if (block <= min_block || divider <= 1) { | ||
L_11.zeros(); | ||
try { | ||
opencl_kernels::cholesky_decompose( | ||
cl::NDRange(A_11.rows()), cl::NDRange(A_11.rows()), A_11.buffer(), | ||
L_11.buffer(), A_11.rows()); | ||
} catch (const cl::Error& e) { | ||
check_opencl_error("cholesky_decompose", e); | ||
} | ||
} else { | ||
L_11 = cholesky_decompose(A_11, block / divider, divider, min_block); | ||
} | ||
// Copies the cholesky factor of A_11 back to the input matrix | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
A.sub_block(L_11, 0, 0, offset, offset, block, block); | ||
// Copies the A_21 submatrix from the input matrix, | ||
auto block_offset = offset + block; | ||
A_21.sub_block(A, block_offset, offset, 0, 0, block_subset, block); | ||
// computes A_21*((L_11^-1)^T) | ||
// and copies the resulting submatrix to the input matrix | ||
matrix_gpu A_11_inverse = lower_triangular_inverse(L_11); | ||
A_11_inverse = transpose(A_11_inverse); | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// TODO(Steve): Replace with mult operator when that PR goes through | ||
matrix_gpu L_21 = multiply(A_21, A_11_inverse); | ||
A.sub_block(L_21, 0, 0, block_offset, offset, block_subset, block); | ||
// Copies the A_22 submatrix from the input matrix, | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
A_22.sub_block(A, block_offset, block_offset, 0, 0, block_subset, | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
block_subset); | ||
// computes A_22 - L_21*(L_21^T) | ||
// and copies the resulting submatrix back to the input matrix | ||
matrix_gpu temp = multiply_transpose(L_21); | ||
// TODO(Steve): Replace with subtraction operator when that PR goes through | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Steve, it may also be cool to code up a way to assign directly to a sub block. We should probably attempt to match Eigen on something like that, though perhaps that's a large ask as their API looks fairly complicated and we may not even want that complication. But somehow assigning directly to a sub-block would really slick up this API. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that would be rad!! I've thought about this in passing but having tried anything out yet. For lvalue idt it would be too bad. Off the top of my head something like a templated struct in the matrix_gpu class called Have to think about rvalue, something about overloaded There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you say a little more about the rvalue problem? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ¯\_(ツ)_/¯ Not really a problem, just that I haven't tried anything or thought about it |
||
matrix_gpu L_22 = subtract(A_22, temp); | ||
A.sub_block(L_22, 0, 0, block_offset, block_offset, block_subset, | ||
block_subset); | ||
offset += block; | ||
} | ||
// Computes the Cholesky factor for the remaining part of the matrix | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
const auto remaining_rows = A.rows() - offset; | ||
if (remaining_rows > 0) { | ||
matrix_gpu A_11(remaining_rows, remaining_rows); | ||
matrix_gpu L_11(remaining_rows, remaining_rows); | ||
A_11.sub_block(A, offset, offset, 0, 0, remaining_rows, remaining_rows); | ||
// Calls the blocked cholesky for the submatrix A_11 | ||
// or calls the kernel directly if the size of the block is small enough | ||
if (block <= min_block || divider <= 1) { | ||
L_11.zeros(); | ||
try { | ||
opencl_kernels::cholesky_decompose( | ||
cl::NDRange(A_11.rows()), cl::NDRange(A_11.rows()), A_11.buffer(), | ||
L_11.buffer(), A_11.rows()); | ||
} catch (const cl::Error& e) { | ||
check_opencl_error("cholesky_decompose", e); | ||
} | ||
} else { | ||
L_11 = cholesky_decompose(A_11, block / divider); | ||
} | ||
A.sub_block(L_11, 0, 0, offset, offset, remaining_rows, remaining_rows); | ||
} | ||
check_nan("cholesky_decompose_gpu", "Matrix m", A); | ||
check_diagonal_zeros("cholesky_decompose_gpu", "Matrix m", A); | ||
A.zeros<stan::math::TriangularViewGPU::Upper>(); | ||
return A; | ||
seantalts marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} // namespace math | ||
} // namespace stan | ||
|
||
#endif | ||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#ifndef STAN_MATH_GPU_KERNELS_CHOLESKY_DECOMPOSE_HPP | ||
#define STAN_MATH_GPU_KERNELS_CHOLESKY_DECOMPOSE_HPP | ||
#ifdef STAN_OPENCL | ||
|
||
#include <stan/math/gpu/kernel_cl.hpp> | ||
|
||
namespace stan { | ||
namespace math { | ||
namespace opencl_kernels { | ||
// \cond | ||
const char *cholesky_decompose_kernel_code = STRINGIFY( | ||
// \endcond | ||
/** | ||
* Calculates the Cholesky Decomposition of a matrix on a GPU | ||
* | ||
* This kernel is run with threads organized in one dimension and | ||
* in a single thread block. The kernel is best suited for | ||
* small input matrices as it only utilizes a single streaming | ||
* multiprocessor. The kernels is used as a part of a blocked | ||
* cholesky decompose. | ||
* | ||
* @param[in] A The input matrix | ||
* @param[in, out] B The result of cholesky decompositon of A. | ||
* It must be set to zeros before execution of this kernel. * | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
* @param rows The number of rows for A and B. | ||
* @note Code is a <code>const char*</code> held in | ||
* <code>cholesky_decompose_kernel_code.</code> | ||
* Used in math/gpu/cholesky_decompose.hpp. | ||
* This kernel uses the helper macros available in helpers.cl. | ||
* | ||
*/ | ||
__kernel void cholesky_decompose(__global double *A, __global double *B, | ||
int rows) { | ||
int local_index = get_local_id(0); | ||
// The following code is the sequential version of the non-inplace | ||
// cholesky decomposition. Only the innermost loops are parallelized. The | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// rows are processed sequentially. This loop process all the rows: | ||
for (int j = 0; j < rows; j++) { | ||
// First thread calculates the diagonal element | ||
if (local_index == 0) { | ||
double sum = 0; | ||
for (int k = 0; k < j; k++) { | ||
sum += B(j, k) * B(j, k); | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
B(j, j) = sqrt(A(j, j) - sum); | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
// Calculates the rest of the row | ||
if (local_index >= (j + 1) && local_index < rows) { | ||
double inner_sum = 0; | ||
for (int k = 0; k < j; k++) { | ||
inner_sum += B(local_index, k) * B(j, k); | ||
} | ||
B(local_index, j) = (1.0 / B(j, j) * (A(local_index, j) - inner_sum)); | ||
} | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
// \cond | ||
); | ||
// \endcond | ||
|
||
/** | ||
* See the docs for \link kernels/cholesky_decompose.hpp cholesky_decompose() | ||
* \endlink | ||
*/ | ||
const local_range_kernel<cl::Buffer, cl::Buffer, int> cholesky_decompose( | ||
"cholesky_decompose", cholesky_decompose_kernel_code); | ||
|
||
} // namespace opencl_kernels | ||
} // namespace math | ||
} // namespace stan | ||
#endif | ||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,10 @@ | |
#include <stan/math/prim/mat/err/check_pos_definite.hpp> | ||
#include <stan/math/prim/mat/err/check_square.hpp> | ||
#include <stan/math/prim/mat/err/check_symmetric.hpp> | ||
#ifdef STAN_OPENCL | ||
#include <stan/math/gpu/cholesky_decompose.hpp> | ||
#include <algorithm> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do you need to include this here? is this for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yeah I'm pretty sure this is because cpplint complains when it sees copy without the algorithm header (even if the call is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the ccplint complained with the error message to add . There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. boo. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @seantalts is this cool? I can look later to see if there is a way to get that check for copy to only happen if it's not from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it might be slightly better to try |
||
#endif | ||
|
||
namespace stan { | ||
namespace math { | ||
|
@@ -23,12 +27,20 @@ namespace math { | |
template <typename T> | ||
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> cholesky_decompose( | ||
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& m) { | ||
#ifdef STAN_OPENCL | ||
seantalts marked this conversation as resolved.
Show resolved
Hide resolved
|
||
matrix_gpu m_gpu(m); | ||
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> m_chol(m.rows(), m.cols()); | ||
cholesky_decompose(m_gpu); | ||
copy(m_chol, m_gpu); | ||
return m_chol; | ||
#else | ||
check_square("cholesky_decompose", "m", m); | ||
check_symmetric("cholesky_decompose", "m", m); | ||
Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows()); | ||
llt.compute(m); | ||
check_pos_definite("cholesky_decompose", "m", llt); | ||
return llt.matrixL(); | ||
#endif | ||
} | ||
|
||
} // namespace math | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#ifdef STAN_OPENCL | ||
#include <stan/math/prim/mat.hpp> | ||
#include <stan/math/gpu/copy.hpp> | ||
#include <stan/math/gpu/cholesky_decompose.hpp> | ||
#include <stan/math/prim/mat/fun/cholesky_decompose.hpp> | ||
#include <stan/math/prim/mat/fun/Eigen.hpp> | ||
#include <stan/math/prim/mat/err/check_pos_definite.hpp> | ||
#include <stan/math/prim/mat/err/check_square.hpp> | ||
#include <stan/math/prim/mat/err/check_symmetric.hpp> | ||
#include <gtest/gtest.h> | ||
#include <algorithm> | ||
#define EXPECT_MATRIX_NEAR(A, B, DELTA) \ | ||
for (int i = 0; i < A.size(); i++) \ | ||
EXPECT_NEAR(A(i), B(i), DELTA); | ||
|
||
TEST(MathMatrix, cholesky_decompose_cpu_vs_gpu_small) { | ||
stan::math::matrix_d m0(3, 3); | ||
m0 << 25, 15, -5, 15, 18, 0, -5, 0, 11; | ||
|
||
stan::math::matrix_d m1(4, 4); | ||
m1 << 18, 22, 54, 42, 22, 70, 86, 62, 54, 86, 174, 134, 42, 62, 134, 106; | ||
|
||
stan::math::matrix_gpu m0_gpu(m0); | ||
stan::math::matrix_gpu m1_gpu(m1); | ||
|
||
stan::math::matrix_d m0_res = stan::math::cholesky_decompose(m0); | ||
stan::math::matrix_d m1_res = stan::math::cholesky_decompose(m1); | ||
|
||
stan::math::matrix_gpu m0_chol_gpu = stan::math::cholesky_decompose(m0_gpu); | ||
seantalts marked this conversation as resolved.
Show resolved
Hide resolved
SteveBronder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
stan::math::matrix_gpu m1_chol_gpu = stan::math::cholesky_decompose(m1_gpu); | ||
|
||
stan::math::copy(m0, m0_chol_gpu); | ||
stan::math::copy(m1, m1_chol_gpu); | ||
|
||
EXPECT_MATRIX_NEAR(m0, m0_res, 1e-8); | ||
EXPECT_MATRIX_NEAR(m1, m1_res, 1e-8); | ||
} | ||
|
||
void cholesky_decompose_test(int size) { | ||
stan::math::matrix_d m1 = stan::math::matrix_d::Random(size, size); | ||
stan::math::matrix_d m1_pos_def | ||
= m1 * m1.transpose() + size * Eigen::MatrixXd::Identity(size, size); | ||
|
||
stan::math::matrix_d m1_cpu(size, size); | ||
stan::math::matrix_d m1_cl(size, size); | ||
|
||
stan::math::check_square("cholesky_decompose", "m", m1_pos_def); | ||
stan::math::check_symmetric("cholesky_decompose", "m", m1_pos_def); | ||
Eigen::LLT<stan::math::matrix_d> llt(m1_pos_def.rows()); | ||
llt.compute(m1_pos_def); | ||
stan::math::check_pos_definite("cholesky_decompose", "m", llt); | ||
m1_cpu = llt.matrixL(); | ||
|
||
m1_cl = stan::math::cholesky_decompose(m1_pos_def); | ||
|
||
double max_error = 0; | ||
for (int i = 0; i < size; i++) { | ||
for (int j = 0; j <= i; j++) { | ||
double abs_err = std::fabs(m1_cpu(i, j) - m1_cl(i, j)); | ||
double a = std::max(abs_err / m1_cpu(i, j), abs_err / m1_cl(i, j)); | ||
max_error = std::max(max_error, a); | ||
} | ||
} | ||
EXPECT_LT(max_error, 1e-8); | ||
} | ||
|
||
TEST(MathMatrix, cholesky_decompose_small) { | ||
cholesky_decompose_test(10); | ||
cholesky_decompose_test(50); | ||
cholesky_decompose_test(100); | ||
} | ||
|
||
TEST(MathMatrix, cholesky_decompose_big) { | ||
cholesky_decompose_test(500); | ||
cholesky_decompose_test(1000); | ||
cholesky_decompose_test(2000); | ||
} | ||
|
||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we end up supporting the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we don't have
<code>block</code>
anymore :/