Add streams capability (#89)

ingonyama-zk · jeremyfelder · Jun 1, 2023 · May 29, 2023 · May 29, 2023 · May 30, 2023
commit 20de60fd431daeca821f9d3588b1954af398a8bc
diff --git a/...REQUEST_TEMPLATE/pull_request_template.md → .github/pull_request_template.md b/...REQUEST_TEMPLATE/pull_request_template.md → .github/pull_request_template.md
@@ -4,4 +4,4 @@ This PR...
 
 ## Linked Issues
 
-Closes #
+Resolves #
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -12,6 +12,7 @@ on:
 env:
   CARGO_TERM_COLOR: always
   ARCH_TYPE: sm_70
+  DEFAULT_STREAM: per-thread
 
 jobs:
   build-linux:

diff --git a/build.rs b/build.rs
@@ -8,9 +8,12 @@ fn main() {
     println!("cargo:rerun-if-changed=./icicle");
 
     let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
+    let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
 
     let mut arch = String::from("-arch=");
     arch.push_str(&arch_type);
+    let mut stream = String::from("-default-stream=");
+    stream.push_str(&stream_type);
 
     let mut nvcc = cc::Build::new();
 
@@ -22,6 +25,7 @@ fn main() {
     nvcc.cuda(true);
     nvcc.debug(false);
     nvcc.flag(&arch);
+    nvcc.flag(&stream);
     nvcc.files([
         "./icicle/curves/index.cu",
     ]);

diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu
diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh
@@ -3,19 +3,19 @@
 #pragma once
 
 template <typename S, typename P, typename A>
-void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device);
+void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream);
 
 template <typename S, typename P, typename A>
-void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device);
+void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream);
 
 template <typename S, typename P, typename A>
-void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device);
+void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream);
 
 template <typename S, typename P, typename A>
-void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device);
+void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream);
 
 template <typename S, typename P, typename A>
-void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, bool on_device);
+void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream);
 
 template <typename A, typename S, typename P>
 void reference_msm(S* scalars, A* a_points, unsigned size);

diff --git a/icicle/appUtils/ntt/lde.cu b/icicle/appUtils/ntt/lde.cu
@@ -15,19 +15,20 @@
  * @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial.
  * @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`.
  */
-template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size) {
+template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
   uint32_t logn = uint32_t(log(n) / log(2));
-  cudaMemcpy(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice);
+  cudaMemcpyAsync(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice, stream);
 
   int NUM_THREADS = min(n / 2, MAX_THREADS_BATCH);
   int NUM_BLOCKS = batch_size * max(int((n / 2) / NUM_THREADS), 1);
   for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
   {
-    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
+    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
   }
 
   NUM_BLOCKS = (n * batch_size + NUM_THREADS - 1) / NUM_THREADS;
-  template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>> (d_out, n * batch_size, S::inv_log_size(logn));
+  template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>> (d_out, n * batch_size, S::inv_log_size(logn));
+  cudaStreamSynchronize(stream);
   return 0;
 }
 
@@ -39,8 +40,8 @@ template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluat
  * @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup.
  * @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length).
  */
-template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n) {
-  return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1);
+template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, cudaStream_t stream) {
+  return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1, stream);
 }
 
 template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
@@ -62,7 +63,7 @@ template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
  * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
  */
 template <typename E, typename S>
-int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers) {
+int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) {
   uint32_t logn = uint32_t(log(domain_size) / log(2));
   if (domain_size > n) {
     // allocate and initialize an array of stream handles to parallelize data copying across batches
@@ -80,18 +81,19 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
       cudaStreamDestroy(memcpy_streams[i]);
     }
   } else
-    cudaMemcpy(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice);
+    cudaMemcpyAsync(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice, stream);
 
   if (coset)
-    batch_vector_mult(coset_powers, d_out, domain_size, batch_size);
+    batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream);
 
   int NUM_THREADS = min(domain_size / 2, MAX_THREADS_BATCH);
   int chunks = max(int((domain_size / 2) / NUM_THREADS), 1);
   int NUM_BLOCKS = batch_size * chunks;
   for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
   {
-    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
+    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
   }
+  cudaStreamSynchronize(stream);
   return 0;
 }
 
@@ -107,76 +109,76 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
  * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
  */
 template <typename E, typename S> 
-int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers) {
-  return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers);
+int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) {
+  return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers, stream);
 }
 
 template <typename S> 
-int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n) {
-  return interpolate(d_out, d_evaluations, d_domain, n);
+int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
+  return interpolate(d_out, d_evaluations, d_domain, n, stream);
 }
 
 template <typename S> 
-int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
-  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
+int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
+  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
 }
 
 template <typename E, typename S> 
-int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n) {
-  return interpolate(d_out, d_evaluations, d_domain, n);
+int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
+  return interpolate(d_out, d_evaluations, d_domain, n, stream);
 }
 
 template <typename E, typename S> 
-int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
-  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
+int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
+  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
 }
 
 template <typename S> 
-int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
+int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
 }
 
 template <typename S> 
-int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size) {
+int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
 }
 
 template <typename E, typename S> 
-int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
+int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
 }
 
 template <typename E, typename S> 
 int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, 
-                          unsigned domain_size, unsigned n, unsigned batch_size) {
+                          unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
 }
 
 template <typename S> 
 int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, 
-                              unsigned domain_size, unsigned n, S* coset_powers) {
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
+                              unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
 }
 
 template <typename E, typename S> 
 int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, 
-                                    unsigned n, unsigned batch_size, S* coset_powers) {
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
+                                    unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
 }
 
 template <typename E, typename S> 
 int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain, 
-                             unsigned domain_size, unsigned n, S* coset_powers) {
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
+                             unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
 }
 
 template <typename E, typename S> 
 int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
-                                   unsigned n, unsigned batch_size, S* coset_powers) {
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
+                                   unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
 }
 #endif
diff --git a/icicle/appUtils/ntt/lde.cuh b/icicle/appUtils/ntt/lde.cuh
@@ -3,44 +3,44 @@
 #pragma once
 
 template <typename S> 
-int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n);
+int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);
 
 template <typename S> 
-int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
+int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);
 
 template <typename E, typename S> 
-int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n);
+int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);
 
 template <typename E, typename S> 
-int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
+int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);
 
 template <typename S> 
-int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
+int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);
 
 template <typename S> 
-int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size);
+int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);
 
 template <typename E, typename S> 
-int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
+int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);
 
 template <typename E, typename S> 
 int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, 
-                          unsigned domain_size, unsigned n, unsigned batch_size);
+                          unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);
 
 template <typename S> 
 int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, 
-                              unsigned domain_size, unsigned n, S* coset_powers);
+                              unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);
 
 template <typename S>                               
 int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, 
-                                    unsigned n, unsigned batch_size, S* coset_powers);
+                                    unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);
 
 template <typename E, typename S> 
 int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain, 
-                             unsigned domain_size, unsigned n, S* coset_powers);
+                             unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);
 
 template <typename E, typename S> 
 int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
-                                   unsigned n, unsigned batch_size, S* coset_powers);
+                                   unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);
 
 #endif