Add GPU Cholesky Primitive #1059

Merged · 77 commits · Feb 12, 2019

Commits
12e5d8e
revised cholesky prim
rok-cesnovar Oct 28, 2018
ceee137
added comments & minor stuff
rok-cesnovar Oct 28, 2018
7528d50
inverse fixes and added function to /prim
rok-cesnovar Oct 29, 2018
8284c14
removed files
rok-cesnovar Oct 29, 2018
834ff4a
now passing
rok-cesnovar Oct 29, 2018
4c87e21
Merge remote-tracking branch 'upstream/develop' into gpu_cholesky_prim
SteveBronder Oct 30, 2018
27beacc
Adds to docs, cleans up some code, use auto and const where possible
SteveBronder Oct 30, 2018
87a67df
Fixes docs and changes name of test against cpu for fixed matrix
SteveBronder Oct 30, 2018
be110d2
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Oct 30, 2018
287ae57
forgot to include algorithm, we need to fix that lint check
SteveBronder Oct 30, 2018
898ebca
Merge branch 'gpu_cholesky_prim' of https://github.com/bstatcomp/math…
SteveBronder Oct 30, 2018
4fdb965
include algorithm again
rok-cesnovar Oct 30, 2018
36bea8b
remove auto on return type
rok-cesnovar Oct 30, 2018
c126ceb
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Oct 30, 2018
6c0daff
move check for square and symmetric to top of cholesky decompose prim
SteveBronder Nov 9, 2018
2d00cdf
[Jenkins] auto-formatting by clang-format version 6.0.0 (tags/google/…
stan-buildbot Nov 9, 2018
e8f9540
Move STAN_OPENCL check to above includes for both tests
SteveBronder Nov 10, 2018
6b57d85
Remove unneded comments, removes the call to the zero kernel and fill…
SteveBronder Nov 15, 2018
1a4a17b
fixed the zeroing in the kernel
rok-cesnovar Nov 15, 2018
36a6d4a
moved the recursion or kernel step to a separate function
rok-cesnovar Nov 15, 2018
10c4f59
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Nov 15, 2018
a781a96
Merge branch 'develop' into gpu_cholesky_prim
rok-cesnovar Nov 16, 2018
88c26a0
using operators for * and -
rok-cesnovar Nov 16, 2018
cc187e6
added tests for edge cases of lower_tri_inverse
rok-cesnovar Nov 16, 2018
0115c85
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Nov 16, 2018
6bea215
Merge remote-tracking branch 'upstream/develop' into gpu_cholesky_prim
SteveBronder Nov 17, 2018
b3ef66d
Merge branch 'gpu_cholesky_prim' of https://github.com/bstatcomp/math…
SteveBronder Nov 17, 2018
1c9bf89
Adds size checks to the cholesky and removes the part in the docs abo…
SteveBronder Nov 18, 2018
763aca2
merge to dev
SteveBronder Dec 26, 2018
021d87d
merge to dev
SteveBronder Dec 26, 2018
0422c04
adds floor and uses half the rows of m in chol for the starting block…
SteveBronder Dec 29, 2018
c120223
Merge pull request #8 from bstatcomp/gpu-chol-prim-floor
SteveBronder Dec 29, 2018
5bc8b71
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Dec 29, 2018
774aef5
merge to develop
SteveBronder Jan 4, 2019
58666aa
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 4, 2019
da52c49
Remove extra ifdef in prim
SteveBronder Jan 23, 2019
dc14da2
Remove extra ifdef in prim
SteveBronder Jan 23, 2019
a736014
Removes tuning parameters from gpu cholesky and recursion
SteveBronder Jan 24, 2019
6649984
Adds argument in cholesky_gpu for the min block size
SteveBronder Jan 24, 2019
7d68799
Merge commit 'fb2cc51188b0171d70d63d5ef8be44998f5b3814' into HEAD
yashikno Jan 24, 2019
f557e47
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 24, 2019
d71e4a6
replace | with || in if for cholesky recursion
SteveBronder Jan 24, 2019
d9675c5
Fix docs
SteveBronder Jan 24, 2019
775f7e4
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 24, 2019
0be4541
Restructure GPU cholesky to get rid of the explicit recursion functio…
SteveBronder Jan 26, 2019
1e90855
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 26, 2019
d461d06
1. Places recursive cholesky in the internal namespace. This
SteveBronder Jan 27, 2019
bb34aec
Merge branch 'gpu_cholesky_prim' of https://github.com/bstatcomp/math…
SteveBronder Jan 27, 2019
d7d09e7
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 27, 2019
d3bd266
Fix ifdef in chol prim
SteveBronder Jan 27, 2019
1b65f2a
Fix docs to match code
SteveBronder Jan 28, 2019
558f4f7
Changes the min_block logic to be a bit more clever. Catch local size…
SteveBronder Jan 29, 2019
85eb0f3
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 29, 2019
907f1ec
Minor comment cleanups, remove excessive auto, use max_thread_block_…
SteveBronder Jan 30, 2019
dc24e9f
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 30, 2019
2f8be5b
Merge remote-tracking branch 'upstream/develop' into gpu_cholesky_prim
SteveBronder Jan 31, 2019
89ebee8
Merge remote-tracking branch 'upstream/develop' into gpu_cholesky_prim
SteveBronder Feb 1, 2019
7a146e4
Merge remote-tracking branch 'upstream/develop' into gpu_cholesky_prim
SteveBronder Feb 3, 2019
fbb97a3
Update so that Cholesky pulls the required size to run the Chol kerne…
SteveBronder Feb 3, 2019
f594437
Make a catch for old devices with small max thread block size for cho…
SteveBronder Feb 3, 2019
6c26574
[Jenkins] auto-formatting by clang-format version 5.0.2-svn328729-1~e…
stan-buildbot Feb 3, 2019
d2285e0
Hard coded values for Cholesky tuning are now moved to the opencl con…
SteveBronder Feb 3, 2019
8b04db9
Update Opencl context
SteveBronder Feb 3, 2019
6bc3eba
[Jenkins] auto-formatting by clang-format version 5.0.2-svn328729-1~e…
stan-buildbot Feb 3, 2019
749e21c
update context and cholesky docs
SteveBronder Feb 3, 2019
1d8a707
Merge branch 'gpu_cholesky_prim' of https://github.com/bstatcomp/math…
SteveBronder Feb 3, 2019
89d1c21
removed zeroing, changed to >= in the size check
rok-cesnovar Feb 4, 2019
9215ebc
Fix names for opencl cholesky and add tests for tuning parameters
SteveBronder Feb 9, 2019
a0b4597
Merge commit '70edefd8b0f009e3d657f9488fe4f1cedb823ba1' into HEAD
yashikno Feb 9, 2019
dcc8c62
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Feb 9, 2019
32dd8ca
forgot to include vector in opencl chol test
SteveBronder Feb 9, 2019
f611fc7
forgot to remove doc for opencl_context.tuning_opts()
SteveBronder Feb 9, 2019
15fbbe1
prim was not using the return value
rok-cesnovar Feb 9, 2019
b704bce
added zeroing for multiply(Nx0,0xM)
rok-cesnovar Feb 10, 2019
08b19f0
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Feb 10, 2019
a3f8881
Reduce and broaden chol test
seantalts Feb 11, 2019
96118c8
Fix runTests.py argument parsing
seantalts Feb 11, 2019
7 changes: 4 additions & 3 deletions runTests.py
@@ -40,10 +40,9 @@ def processCLIArgs():
parser.add_argument("tests", nargs="+", type=str,
help=tests_help_msg)
f_help_msg = "Only tests with file names matching these will be executed.\n"
f_help_msg += "Example: '-f chol', '-f gpu', '-f prim mat'"
parser.add_argument("-f", nargs="+", type=str, default = "",
f_help_msg += "Example: '-f chol', '-f gpu', '-f prim'"
parser.add_argument("-f", type=str, default = [], action="append",
help=f_help_msg)

parser.add_argument("-d", "--debug", dest="debug", action="store_true",
help="request additional script debugging output.")
parser.add_argument("-m", "--make-only", dest="make_only",
@@ -158,6 +157,8 @@ def main():
tests = findTests(inputs.tests, inputs.f)
if not tests:
stopErr("No matching tests found.", -1)
if inputs.debug:
print("Collected the following tests:\n", tests)

# pass 1: make test executables
for batch in batched(tests):
101 changes: 101 additions & 0 deletions stan/math/gpu/cholesky_decompose.hpp
@@ -0,0 +1,101 @@
#ifndef STAN_MATH_GPU_CHOLESKY_DECOMPOSE_HPP
#define STAN_MATH_GPU_CHOLESKY_DECOMPOSE_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/opencl_context.hpp>
#include <stan/math/gpu/matrix_gpu.hpp>
#include <stan/math/gpu/kernels/cholesky_decompose.hpp>
#include <stan/math/gpu/multiply.hpp>
#include <stan/math/gpu/multiply_transpose.hpp>
#include <stan/math/gpu/lower_tri_inverse.hpp>
#include <stan/math/gpu/transpose.hpp>
#include <stan/math/gpu/subtract.hpp>
#include <stan/math/gpu/err/check_diagonal_zeros.hpp>
#include <stan/math/gpu/err/check_nan.hpp>
#include <CL/cl.hpp>
#include <algorithm>
#include <cmath>

namespace stan {
namespace math {
/**
* Return the lower-triangular Cholesky factor (i.e., matrix
* square root) of the specified square, symmetric matrix.
* The return value \f$L\f$ will be a lower-triangular matrix such that the
* original matrix \f$A\f$ is given by
* <p>\f$A = L \times L^T\f$.
* The Cholesky decomposition is computed on the GPU. The algorithm is
* recursive: if <code>A.rows()</code> is at most
* <code>cholesky_min_L11_size</code>, the decomposition is computed
* directly by the GPU kernel. Otherwise the top-left block of size
* <code>A.rows() / cholesky_partition</code> is factored by a recursive
* call, and the factor of the full matrix is built by propagating that
* block's Cholesky forward as given in the reference report below.
*
* For a full guide to how this works
* see the Cholesky decomposition chapter in the reference report
* <a href="https://goo.gl/6kWkJ5"> here</a>.
* @param A Symmetric matrix on the GPU.
* @return Lower-triangular Cholesky factor on the GPU.
* @throw std::domain_error if A is not
* positive definite (and has more than 0 elements)
*/
inline matrix_gpu cholesky_decompose(matrix_gpu& A) {
if (A.rows() == 0)
return A;
// Repeat the blocked Cholesky decomposition until the remaining
// submatrix is no larger than the minimum block size
// (cholesky_min_L11_size).
// The base-case GPU kernel runs in a single thread block, so the matrix
// must fit within the device's maximum thread block size.
if (A.rows() <= opencl_context.tuning_opts().cholesky_min_L11_size) {
matrix_gpu L(A.rows(), A.cols());
try {
opencl_kernels::cholesky_decompose(cl::NDRange(A.rows()),
cl::NDRange(A.rows()), A.buffer(),
L.buffer(), A.rows());
} catch (const cl::Error& e) {
check_opencl_error("cholesky_decompose", e);
}
return L;
}
// NOTE: The code in this section follows the naming conventions
// in the report linked in the docs.
const int block
= floor(A.rows() / opencl_context.tuning_opts().cholesky_partition);
// Subset the top left block of the input A into A_11
matrix_gpu A_11(block, block);
A_11.sub_block(A, 0, 0, 0, 0, block, block);
// The following function either calls the
// blocked cholesky recursively for the submatrix A_11
// or calls the kernel directly if the size of the block is small enough
matrix_gpu L_11 = stan::math::cholesky_decompose(A_11);
// Copies L_11 back to the input matrix
A.sub_block(L_11, 0, 0, 0, 0, block, block);

const int block_subset = A.rows() - block;
matrix_gpu A_21(block_subset, block);
A_21.sub_block(A, block, 0, 0, 0, block_subset, block);
// computes A_21*((L_11^-1)^T)
// and copies the resulting submatrix to the lower left hand corner of A
matrix_gpu L_21 = A_21 * transpose(lower_triangular_inverse(L_11));
A.sub_block(L_21, 0, 0, block, 0, block_subset, block);
matrix_gpu A_22(block_subset, block_subset);
A_22.sub_block(A, block, block, 0, 0, block_subset, block_subset);
// computes A_22 - L_21*(L_21^T)
matrix_gpu L_22 = A_22 - multiply_transpose(L_21);
// factor the Schur complement and copy it into A's bottom-right corner
matrix_gpu L_rem_11 = stan::math::cholesky_decompose(L_22);
A.sub_block(L_rem_11, 0, 0, block, block, block_subset, block_subset);
check_nan("cholesky_decompose_gpu", "Matrix m", A);
check_diagonal_zeros("cholesky_decompose_gpu", "Matrix m", A);
return A;
}

} // namespace math
} // namespace stan

#endif
#endif
80 changes: 80 additions & 0 deletions stan/math/gpu/kernels/cholesky_decompose.hpp
@@ -0,0 +1,80 @@
#ifndef STAN_MATH_GPU_KERNELS_CHOLESKY_DECOMPOSE_HPP
#define STAN_MATH_GPU_KERNELS_CHOLESKY_DECOMPOSE_HPP
#ifdef STAN_OPENCL

#include <stan/math/gpu/kernel_cl.hpp>

namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *cholesky_decompose_kernel_code = STRINGIFY(
// \endcond
/**
* Calculates the Cholesky Decomposition of a matrix on a GPU
*
* This kernel is run with threads organized in one dimension and
* in a single thread block. The kernel is best suited for
* small input matrices as it only utilizes a single streaming
* multiprocessor. The kernel is used as a part of a blocked
* Cholesky decomposition.
*
* @param[in] A The input matrix
* @param[in, out] B The result of the Cholesky decomposition of A.
* @param rows The number of rows for A and B.
* @note Code is a <code>const char*</code> held in
* <code>cholesky_decompose_kernel_code.</code>
* Used in math/gpu/cholesky_decompose.hpp.
* This kernel uses the helper macros available in helpers.cl.
*
*/
__kernel void cholesky_decompose(__global double *A, __global double *B,
int rows) {
int local_index = get_local_id(0);
// Fill B with zeros
// B is square so checking row length is fine for both i and j
if (local_index < rows) {
for (int k = 0; k < rows; k++) {
B(local_index, k) = 0;
}
}
// The following code is the sequential version of the non-inplace
// cholesky decomposition. Only the innermost loops are parallelized. The
// rows are processed sequentially. This loop processes all the rows:
for (int j = 0; j < rows; j++) {
// First thread calculates the diagonal element
if (local_index == 0) {
double sum = 0;
for (int k = 0; k < j; k++) {
sum += B(j, k) * B(j, k);
}
B(j, j) = sqrt(A(j, j) - sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
// Calculates the rest of the row
if (local_index >= (j + 1) && local_index < rows) {
double inner_sum = 0;
for (int k = 0; k < j; k++) {
inner_sum += B(local_index, k) * B(j, k);
}
B(local_index, j) = (1.0 / B(j, j) * (A(local_index, j) - inner_sum));
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// \cond
);
// \endcond

/**
* See the docs for \link kernels/cholesky_decompose.hpp cholesky_decompose()
* \endlink
*/
const local_range_kernel<cl::Buffer, cl::Buffer, int> cholesky_decompose(
"cholesky_decompose", cholesky_decompose_kernel_code);

} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
11 changes: 8 additions & 3 deletions stan/math/gpu/kernels/inv_lower_tri_multiply.hpp
@@ -77,14 +77,19 @@ const char* inv_lower_tri_multiply_kernel_code = STRINGIFY(
const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;
const int local_row = thread_block_row;
// Elements above the diagonal will not be transferred.
if (C2_global_col <= C2_global_row) {
if (C2_global_col <= C2_global_row && C2_global_col < A_rows
&& C2_global_row < A_rows) {
C2_local[local_col][local_row]
= A[C2_global_col * A_rows + C2_global_row];
} else {
C2_local[local_col][local_row] = 0;
}
A3_local[local_col][local_row]
= A[A3_global_col * A_rows + A3_global_row];
if (A3_global_col < A_rows && A3_global_row < A_rows) {
A3_local[local_col][local_row]
= A[A3_global_col * A_rows + A3_global_row];
} else {
A3_local[local_col][local_row] = 0.0;
}
}
// Wait until all tile values are loaded to the local memory
barrier(CLK_LOCAL_MEM_FENCE);
7 changes: 5 additions & 2 deletions stan/math/gpu/kernels/neg_rect_lower_tri_multiply.hpp
@@ -73,7 +73,8 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
temp_local[local_col][local_row] = 0.0;
}
// Elements above the diagonal will not be transferred.
if (C1_global_col <= C1_global_row) {
if (C1_global_col <= C1_global_row && C1_global_col < A_rows
&& C1_global_row < A_rows) {
C1_local[local_col][local_row]
= A[C1_global_col * A_rows + C1_global_row];
} else {
@@ -102,7 +103,9 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int A_global_col
= A_global_col_offset + w * THREAD_BLOCK_SIZE_COL;
A[A_global_col * A_rows + i + rows + offset] = -acc[w];
if (A_global_col < A_rows && (i + rows + offset) < A_rows) {
A[A_global_col * A_rows + i + rows + offset] = -acc[w];
}
}
}
// \cond
4 changes: 3 additions & 1 deletion stan/math/gpu/multiply.hpp
@@ -57,8 +57,10 @@ inline auto multiply(const matrix_gpu& A, const matrix_gpu& B) {
check_size_match("multiply (GPU)", "A.cols()", A.cols(), "B.rows()",
B.rows());
matrix_gpu temp(A.rows(), B.cols());
if (A.size() == 0 || B.size() == 0)
if (A.size() == 0 || B.size() == 0) {
temp.zeros();
return temp;
}
int local = opencl_kernels::matrix_multiply.make_functor.get_opts().at(
"THREAD_BLOCK_SIZE");
int Mpad = ((A.rows() + local - 1) / local) * local;
24 changes: 22 additions & 2 deletions stan/math/gpu/opencl_context.hpp
@@ -3,7 +3,7 @@
#ifdef STAN_OPENCL
#define __CL_ENABLE_EXCEPTIONS

#define DEVICE_FILTER CL_DEVICE_TYPE_GPU
#define DEVICE_FILTER CL_DEVICE_TYPE_ALL
#ifndef OPENCL_DEVICE_ID
#error OPENCL_DEVICE_ID_NOT_SET
#endif
@@ -106,6 +106,13 @@ class opencl_context_base {
base_opts_["THREAD_BLOCK_SIZE"] = thread_block_size_sqrt;
base_opts_["WORK_PER_THREAD"] = 1;
}
// Thread block size for the Cholesky
// TODO(Steve): This should be tuned in a higher part of the stan language
if (max_thread_block_size_ >= 256) {
tuning_opts_.cholesky_min_L11_size = 256;
} else {
tuning_opts_.cholesky_min_L11_size = max_thread_block_size_;
}
} catch (const cl::Error& e) {
check_opencl_error("opencl_context", e);
}
@@ -133,6 +140,12 @@
{"LOWER_TO_UPPER", static_cast<int>(TriangularMapGPU::LowerToUpper)},
{"THREAD_BLOCK_SIZE", 32},
{"WORK_PER_THREAD", 8}};
// TODO(Steve): Make these tunable during warmup
struct tuning_struct {
int cholesky_min_L11_size = 256;
int cholesky_partition = 4;
int cholesky_size_worth_transfer = 1250;
} tuning_opts_;

static opencl_context_base& getInstance() {
static opencl_context_base instance_;
@@ -229,7 +242,7 @@ class opencl_context {

try {
std::vector<cl::Device> all_devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);

for (auto device_iter : all_devices) {
cl::Device device(device_iter);
@@ -304,6 +317,13 @@
return opencl_context_base::getInstance().max_thread_block_size_;
}

/**
* Returns the struct of tuning parameters for the GPU routines,
* e.g. the minimum size of the Cholesky L_11 block.
*/
inline opencl_context_base::tuning_struct& tuning_opts() {
return opencl_context_base::getInstance().tuning_opts_;
}

/**
* Returns a vector containing the OpenCL device used to create the context
*/
24 changes: 23 additions & 1 deletion stan/math/prim/mat/fun/cholesky_decompose.hpp
@@ -1,10 +1,17 @@
#ifndef STAN_MATH_PRIM_MAT_FUN_CHOLESKY_DECOMPOSE_HPP
#define STAN_MATH_PRIM_MAT_FUN_CHOLESKY_DECOMPOSE_HPP

#include <stan/math/gpu/opencl_context.hpp>
#include <stan/math/prim/mat/fun/Eigen.hpp>
#include <stan/math/prim/mat/err/check_pos_definite.hpp>
#include <stan/math/prim/mat/err/check_square.hpp>
#include <stan/math/prim/mat/err/check_symmetric.hpp>
#ifdef STAN_OPENCL
#include <stan/math/gpu/cholesky_decompose.hpp>
#include <stan/math/gpu/copy.hpp>
#endif

#include <cmath>

namespace stan {
namespace math {
@@ -25,12 +32,27 @@ Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> cholesky_decompose(
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& m) {
check_square("cholesky_decompose", "m", m);
check_symmetric("cholesky_decompose", "m", m);
#ifdef STAN_OPENCL
if (m.rows() >= opencl_context.tuning_opts().cholesky_size_worth_transfer) {
matrix_gpu m_gpu(m);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> m_chol(m.rows(), m.cols());
m_gpu = cholesky_decompose(m_gpu);
copy(m_chol, m_gpu); // NOLINT
return m_chol;
} else {
Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows());
llt.compute(m);
check_pos_definite("cholesky_decompose", "m", llt);
return llt.matrixL();
}
#else
Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows());
llt.compute(m);
check_pos_definite("cholesky_decompose", "m", llt);
return llt.matrixL();
#endif
}

} // namespace math

} // namespace stan
#endif