stan-dev · seantalts · Feb 12, 2019 · Oct 28, 2018 · Oct 28, 2018 · Oct 29, 2018
diff --git a/stan/math/gpu/cholesky_decompose.hpp b/stan/math/gpu/cholesky_decompose.hpp
@@ -0,0 +1,132 @@
+#ifndef STAN_MATH_GPU_CHOLESKY_DECOMPOSE_HPP
+#define STAN_MATH_GPU_CHOLESKY_DECOMPOSE_HPP
+#ifdef STAN_OPENCL
+#include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/kernels/cholesky_decompose.hpp>
+#include <stan/math/gpu/multiply.hpp>
+#include <stan/math/gpu/multiply_transpose.hpp>
+#include <stan/math/gpu/lower_tri_inverse.hpp>
+#include <stan/math/gpu/transpose.hpp>
+#include <stan/math/gpu/subtract.hpp>
+#include <stan/math/gpu/err/check_diagonal_zeros.hpp>
+#include <stan/math/gpu/err/check_nan.hpp>
+#include <CL/cl.hpp>
+#include <algorithm>
+namespace stan {
+namespace math {
+/**
+ * Return the lower-triangular Cholesky factor (i.e., matrix
+ * square root) of the specified square, symmetric matrix.
+ * The return value \f$L\f$ will be a lower-triangular matrix such that the
+ * original matrix \f$A\f$ is given by
+ * <p>\f$A = L \times L^T\f$.
+ *
+ * The Cholesky decomposition is computed on the GPU with a blocked,
+ * recursive algorithm. The parameters <code>block</code>,
+ * <code>divider</code>, and <code>min_block</code> act as tuning parameters.
+ * The matrix is walked along its diagonal in submatrices of
+ * <code>block</code> rows and columns. A diagonal submatrix is factored
+ * directly by the OpenCL kernel when <code>block</code> is at most
+ * <code>min_block</code>, when <code>divider</code> is at most 1, or when
+ * <code>block / divider</code> would be zero. Otherwise
+ * <code>cholesky_decompose</code> is run again on that submatrix with
+ * <code>block</code> equal to <code>block / divider</code> and the same
+ * <code>divider</code> and <code>min_block</code>. Once a diagonal submatrix
+ * is factored, the result is propagated to the rows below it and to the
+ * trailing submatrix as given in the reference report below.
+ *
+ * For a full guide to how this works
+ * see the Cholesky decomposition chapter in the reference report
+ * <a href="https://goo.gl/6kWkJ5"> here</a>.
+ *
+ * @note <code>A</code> is used as working storage and is overwritten with
+ *  the Cholesky factor; the return value is a copy of it.
+ *
+ * @param A Symmetric matrix on the GPU. Overwritten with the result.
+ * @param block Size of the block used to compute the cholesky decomposition.
+ *  Expected to be positive.
+ * @param divider Proportion to divide the submatrix by at each recursive step.
+ * @param min_block The amount that block is checked against to decide
+ *  whether to continue the recursion or to perform the cholesky.
+ * @return Square root of matrix on the GPU.
+ * @throw std::domain_error if A is not
+ *  positive definite (if A has more than 0 elements)
+ */
+inline matrix_gpu cholesky_decompose(matrix_gpu& A, const int block = 100,
+                                     const int divider = 2,
+                                     const int min_block = 100) {
+  // The kernel is used directly once the block is small enough, when the
+  // divider cannot shrink the block, or when dividing once more would give
+  // an empty block (which could never make progress). The short-circuit
+  // order keeps block / divider from being evaluated for divider <= 1.
+  const bool use_kernel
+      = block <= min_block || divider <= 1 || block / divider < 1;
+  // Factors one diagonal block: A_blk is the input and L_blk receives its
+  // Cholesky factor. Shared by the main loop and the trailing block so both
+  // recurse with the same tuning parameters.
+  const auto factor_block = [&](matrix_gpu& A_blk, matrix_gpu& L_blk) {
+    if (use_kernel) {
+      // The kernel requires its output to be zeroed beforehand.
+      L_blk.zeros();
+      try {
+        opencl_kernels::cholesky_decompose(
+            cl::NDRange(A_blk.rows()), cl::NDRange(A_blk.rows()),
+            A_blk.buffer(), L_blk.buffer(), A_blk.rows());
+      } catch (const cl::Error& e) {
+        check_opencl_error("cholesky_decompose", e);
+      }
+    } else {
+      L_blk = cholesky_decompose(A_blk, block / divider, divider, min_block);
+    }
+  };
+
+  auto offset = 0;
+  // NOTE: The code in this section follows the naming conventions
+  // in the report linked in the docs.
+  matrix_gpu A_11(block, block);
+  matrix_gpu L_11(block, block);
+  // Repeats the blocked cholesky decomposition until the size of the remaining
+  // submatrix is smaller or equal to the block size
+  while ((offset + block) < (A.rows())) {
+    const auto block_subset = A.rows() - offset - block;
+    matrix_gpu A_21(block_subset, block);
+    matrix_gpu A_22(block_subset, block_subset);
+    // Copies the A_11 submatrix from the input
+    A_11.sub_block(A, offset, offset, 0, 0, block, block);
+    // Calls the blocked cholesky for the submatrix A_11
+    // or calls the kernel directly if the size of the block is small enough
+    factor_block(A_11, L_11);
+    // Copies the cholesky factor of A_11 back to the input matrix
+    A.sub_block(L_11, 0, 0, offset, offset, block, block);
+    // Copies the A_21 submatrix from the input matrix,
+    const auto block_offset = offset + block;
+    A_21.sub_block(A, block_offset, offset, 0, 0, block_subset, block);
+    // computes A_21*((L_11^-1)^T)
+    // and copies the resulting submatrix to the input matrix
+    matrix_gpu A_11_inverse = lower_triangular_inverse(L_11);
+    A_11_inverse = transpose(A_11_inverse);
+    // TODO(Steve): Replace with mult operator when that PR goes through
+    matrix_gpu L_21 = multiply(A_21, A_11_inverse);
+    A.sub_block(L_21, 0, 0, block_offset, offset, block_subset, block);
+    // Copies the A_22 submatrix from the input matrix,
+    A_22.sub_block(A, block_offset, block_offset, 0, 0, block_subset,
+                   block_subset);
+    // computes A_22 - L_21*(L_21^T)
+    // and copies the resulting submatrix back to the input matrix
+    matrix_gpu temp = multiply_transpose(L_21);
+    // TODO(Steve): Replace with subtraction operator when that PR goes through
+    matrix_gpu L_22 = subtract(A_22, temp);
+    A.sub_block(L_22, 0, 0, block_offset, block_offset, block_subset,
+                block_subset);
+    offset += block;
+  }
+  // Computes the Cholesky factor for the remaining part of the matrix
+  const auto remaining_rows = A.rows() - offset;
+  if (remaining_rows > 0) {
+    matrix_gpu A_rem(remaining_rows, remaining_rows);
+    matrix_gpu L_rem(remaining_rows, remaining_rows);
+    A_rem.sub_block(A, offset, offset, 0, 0, remaining_rows, remaining_rows);
+    factor_block(A_rem, L_rem);
+    A.sub_block(L_rem, 0, 0, offset, offset, remaining_rows, remaining_rows);
+  }
+  // A failed factorization shows up as NaNs or zeros on the diagonal of the
+  // result, so the checks run on the finished factor.
+  check_nan("cholesky_decompose_gpu", "Matrix m", A);
+  check_diagonal_zeros("cholesky_decompose_gpu", "Matrix m", A);
+  // Only the lower triangle holds the factor; clear what is left above it.
+  A.zeros<stan::math::TriangularViewGPU::Upper>();
+  return A;
+}
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/stan/math/gpu/kernels/cholesky_decompose.hpp b/stan/math/gpu/kernels/cholesky_decompose.hpp
@@ -0,0 +1,74 @@
+#ifndef STAN_MATH_GPU_KERNELS_CHOLESKY_DECOMPOSE_HPP
+#define STAN_MATH_GPU_KERNELS_CHOLESKY_DECOMPOSE_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+// static: this header may be included from several translation units and a
+// non-const namespace-scope pointer would otherwise be defined in each one.
+static const char *cholesky_decompose_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Calculates the Cholesky decomposition of a matrix on a GPU.
+     *
+     * This kernel is run with threads organized in one dimension and
+     * in a single thread block. The kernel is best suited for
+     * small input matrices as it only utilizes a single streaming
+     * multiprocessor. The kernel is used as a part of a blocked
+     * cholesky decompose.
+     *
+     * @param[in] A The input matrix. Only read by the kernel.
+     * @param[in, out] B The result of the cholesky decomposition of A.
+     *  It must be set to zeros before execution of this kernel.
+     * @param rows The number of rows for A and B.
+     * @note Code is a <code>const char*</code> held in
+     * <code>cholesky_decompose_kernel_code.</code>
+     *  Used in math/gpu/cholesky_decompose.hpp.
+     *  This kernel uses the helper macros available in helpers.cl.
+     *
+     */
+    __kernel void cholesky_decompose(__global double *A, __global double *B,
+                                     int rows) {
+      int local_index = get_local_id(0);
+      // The following code is the sequential version of the non-inplace
+      // cholesky decomposition. Only the innermost loops are parallelized.
+      // Column j of B is completed in iteration j: first its diagonal
+      // element, then the elements below it, one per work-item.
+      for (int j = 0; j < rows; j++) {
+        // First thread calculates the diagonal element
+        if (local_index == 0) {
+          double sum = 0;
+          for (int k = 0; k < j; k++) {
+            sum += B(j, k) * B(j, k);
+          }
+          B(j, j) = sqrt(A(j, j) - sum);
+        }
+        // B lives in global memory, so a global fence is needed for the
+        // other work-items to observe the new B(j, j).
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        // Calculates the elements below the diagonal in column j
+        if (local_index >= (j + 1) && local_index < rows) {
+          double inner_sum = 0;
+          for (int k = 0; k < j; k++) {
+            inner_sum += B(local_index, k) * B(j, k);
+          }
+          B(local_index, j) = (1.0 / B(j, j) * (A(local_index, j) - inner_sum));
+        }
+        // The next iteration reads the values written above from other
+        // work-items, again through global memory.
+        barrier(CLK_GLOBAL_MEM_FENCE);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/cholesky_decompose.hpp cholesky_decompose()
+ * \endlink
+ *
+ * Callable wrapper for the OpenCL kernel named "cholesky_decompose" whose
+ * source is held in <code>cholesky_decompose_kernel_code</code>. The three
+ * argument types correspond to the kernel parameters <code>A</code>,
+ * <code>B</code> and <code>rows</code>.
+ */
+const local_range_kernel<cl::Buffer, cl::Buffer, int> cholesky_decompose(
+    "cholesky_decompose", cholesky_decompose_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/stan/math/gpu/kernels/inv_lower_tri_multiply.hpp b/stan/math/gpu/kernels/inv_lower_tri_multiply.hpp
@@ -77,14 +77,19 @@ const char* inv_lower_tri_multiply_kernel_code = STRINGIFY(
           const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;
           const int local_row = thread_block_row;
           // Element above the diagonal will not be transferred.
-          if (C2_global_col <= C2_global_row) {
+          if (C2_global_col <= C2_global_row && C2_global_col < A_rows
+              && C2_global_row < A_rows) {
             C2_local[local_col][local_row]
                 = A[C2_global_col * A_rows + C2_global_row];
           } else {
             C2_local[local_col][local_row] = 0;
           }
-          A3_local[local_col][local_row]
-              = A[A3_global_col * A_rows + A3_global_row];
+          if (A3_global_col < A_rows && A3_global_row < A_rows) {
+            A3_local[local_col][local_row]
+                = A[A3_global_col * A_rows + A3_global_row];
+          } else {
+            A3_local[local_col][local_row] = 0.0;
+          }
         }
         // Wait until all tile values are loaded to the local memory
         barrier(CLK_LOCAL_MEM_FENCE);

diff --git a/stan/math/gpu/kernels/neg_rect_lower_tri_multiply.hpp b/stan/math/gpu/kernels/neg_rect_lower_tri_multiply.hpp
@@ -73,7 +73,8 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
             temp_local[local_col][local_row] = 0.0;
           }
           // Element above the diagonal will not be transferred.
-          if (C1_global_col <= C1_global_row) {
+          if (C1_global_col <= C1_global_row && C1_global_col < A_rows
+              && C1_global_row < A_rows) {
             C1_local[local_col][local_row]
                 = A[C1_global_col * A_rows + C1_global_row];
           } else {
@@ -102,7 +103,9 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
       for (int w = 0; w < WORK_PER_THREAD; w++) {
         const int A_global_col
             = A_global_col_offset + w * THREAD_BLOCK_SIZE_COL;
-        A[A_global_col * A_rows + i + rows + offset] = -acc[w];
+        if (A_global_col < A_rows && (i + rows + offset) < A_rows) {
+          A[A_global_col * A_rows + i + rows + offset] = -acc[w];
+        }
       }
     }
     // \cond

diff --git a/stan/math/prim/mat/fun/cholesky_decompose.hpp b/stan/math/prim/mat/fun/cholesky_decompose.hpp
@@ -5,6 +5,10 @@
 #include <stan/math/prim/mat/err/check_pos_definite.hpp>
 #include <stan/math/prim/mat/err/check_square.hpp>
 #include <stan/math/prim/mat/err/check_symmetric.hpp>
+#ifdef STAN_OPENCL
+#include <stan/math/gpu/cholesky_decompose.hpp>
+#include <algorithm>
+#endif
 
 namespace stan {
 namespace math {
@@ -23,12 +27,20 @@ namespace math {
 template <typename T>
 Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> cholesky_decompose(
     const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& m) {
+#ifdef STAN_OPENCL
+  // The GPU routine only inspects its result (NaNs, zero diagonal), so the
+  // square/symmetric preconditions of the CPU path are enforced here before
+  // the matrix is transferred.
+  check_square("cholesky_decompose", "m", m);
+  check_symmetric("cholesky_decompose", "m", m);
+  matrix_gpu m_gpu(m);
+  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> m_chol(m.rows(), m.cols());
+  // The GPU overload factors m_gpu in place; its returned copy is not needed.
+  cholesky_decompose(m_gpu);
+  copy(m_chol, m_gpu);
+  return m_chol;
+#else
   check_square("cholesky_decompose", "m", m);
   check_symmetric("cholesky_decompose", "m", m);
   Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows());
   llt.compute(m);
   check_pos_definite("cholesky_decompose", "m", llt);
   return llt.matrixL();
+#endif
 }
 
 }  // namespace math

diff --git a/test/unit/math/gpu/cholesky_decompose_test.cpp b/test/unit/math/gpu/cholesky_decompose_test.cpp
@@ -0,0 +1,79 @@
+#ifdef STAN_OPENCL
+#include <stan/math/prim/mat.hpp>
+#include <stan/math/gpu/copy.hpp>
+#include <stan/math/gpu/cholesky_decompose.hpp>
+#include <stan/math/prim/mat/fun/cholesky_decompose.hpp>
+#include <stan/math/prim/mat/fun/Eigen.hpp>
+#include <stan/math/prim/mat/err/check_pos_definite.hpp>
+#include <stan/math/prim/mat/err/check_square.hpp>
+#include <stan/math/prim/mat/err/check_symmetric.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+#define EXPECT_MATRIX_NEAR(A, B, DELTA) \
+  for (int i = 0; i < A.size(); i++)    \
+    EXPECT_NEAR(A(i), B(i), DELTA);
+
+// Checks the Cholesky factors of two small positive-definite matrices, as
+// computed by the GPU routine and by the Eigen-facing overload, against a
+// reference computed on the CPU with Eigen::LLT.
+TEST(MathMatrix, cholesky_decompose_cpu_vs_gpu_small) {
+  stan::math::matrix_d m0(3, 3);
+  m0 << 25, 15, -5, 15, 18, 0, -5, 0, 11;
+
+  stan::math::matrix_d m1(4, 4);
+  m1 << 18, 22, 54, 42, 22, 70, 86, 62, 54, 86, 174, 134, 42, 62, 134, 106;
+
+  // stan::math::cholesky_decompose(matrix_d) itself goes through the GPU when
+  // STAN_OPENCL is defined, so the CPU reference is taken from Eigen directly.
+  Eigen::LLT<stan::math::matrix_d> llt0(m0.rows());
+  llt0.compute(m0);
+  stan::math::matrix_d m0_ref(3, 3);
+  m0_ref = llt0.matrixL();
+
+  Eigen::LLT<stan::math::matrix_d> llt1(m1.rows());
+  llt1.compute(m1);
+  stan::math::matrix_d m1_ref(4, 4);
+  m1_ref = llt1.matrixL();
+
+  stan::math::matrix_gpu m0_gpu(m0);
+  stan::math::matrix_gpu m1_gpu(m1);
+
+  stan::math::matrix_d m0_res = stan::math::cholesky_decompose(m0);
+  stan::math::matrix_d m1_res = stan::math::cholesky_decompose(m1);
+
+  stan::math::matrix_gpu m0_chol_gpu = stan::math::cholesky_decompose(m0_gpu);
+  stan::math::matrix_gpu m1_chol_gpu = stan::math::cholesky_decompose(m1_gpu);
+
+  // The inputs are no longer needed, so they are reused as host-side storage
+  // for the GPU results.
+  stan::math::copy(m0, m0_chol_gpu);
+  stan::math::copy(m1, m1_chol_gpu);
+
+  EXPECT_MATRIX_NEAR(m0, m0_ref, 1e-8);
+  EXPECT_MATRIX_NEAR(m1, m1_ref, 1e-8);
+  EXPECT_MATRIX_NEAR(m0_res, m0_ref, 1e-8);
+  EXPECT_MATRIX_NEAR(m1_res, m1_ref, 1e-8);
+}
+
+/**
+ * Compares the lower-triangular factor returned by
+ * stan::math::cholesky_decompose with the one computed by Eigen::LLT for a
+ * random positive-definite matrix, using the largest relative error over the
+ * lower triangle.
+ *
+ * @param size Number of rows and columns of the test matrix.
+ */
+void cholesky_decompose_test(int size) {
+  stan::math::matrix_d m1 = stan::math::matrix_d::Random(size, size);
+  // m1 * m1^T is positive semi-definite; adding size * I makes the test
+  // matrix positive definite.
+  stan::math::matrix_d m1_pos_def
+      = m1 * m1.transpose() + size * Eigen::MatrixXd::Identity(size, size);
+
+  stan::math::matrix_d m1_cpu(size, size);
+  stan::math::matrix_d m1_cl(size, size);
+
+  // Reference factor, computed on the CPU.
+  stan::math::check_square("cholesky_decompose", "m", m1_pos_def);
+  stan::math::check_symmetric("cholesky_decompose", "m", m1_pos_def);
+  Eigen::LLT<stan::math::matrix_d> llt(m1_pos_def.rows());
+  llt.compute(m1_pos_def);
+  stan::math::check_pos_definite("cholesky_decompose", "m", llt);
+  m1_cpu = llt.matrixL();
+
+  m1_cl = stan::math::cholesky_decompose(m1_pos_def);
+
+  double max_error = 0;
+  for (int i = 0; i < size; i++) {
+    for (int j = 0; j <= i; j++) {
+      const double abs_err = std::fabs(m1_cpu(i, j) - m1_cl(i, j));
+      // Relative error against the smaller magnitude of the two entries.
+      // Magnitudes are required: entries below the diagonal may be negative,
+      // and dividing by a signed value would give a negative "error" that
+      // could never raise max_error.
+      const double scale
+          = std::min(std::fabs(m1_cpu(i, j)), std::fabs(m1_cl(i, j)));
+      // Fall back to the absolute error when an entry is exactly zero.
+      const double rel_err = scale > 0 ? abs_err / scale : abs_err;
+      max_error = std::max(max_error, rel_err);
+    }
+  }
+  EXPECT_LT(max_error, 1e-8);
+}
+
+// Runs cholesky_decompose_test for matrix sizes 10, 50 and 100.
+TEST(MathMatrix, cholesky_decompose_small) {
+  const int sizes[] = {10, 50, 100};
+  for (const int size : sizes) {
+    cholesky_decompose_test(size);
+  }
+}
+
+// Runs cholesky_decompose_test for matrix sizes 500, 1000 and 2000.
+TEST(MathMatrix, cholesky_decompose_big) {
+  const int sizes[] = {500, 1000, 2000};
+  for (const int size : sizes) {
+    cholesky_decompose_test(size);
+  }
+}
+
+#endif