Merge branch 'dev' into fix/goicicle-setup-script

Obaid51 · Aug 15, 2023 · 11fe11b · 11fe11b
2 parents 36133ba + 19d0730
commit 11fe11b
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 25 deletions.
diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu
@@ -130,14 +130,14 @@ __global__ void find_cutoff_kernel(unsigned *v, unsigned size, unsigned cutoff,
     return;
   }
   const unsigned start_index = tid*run_length;
-  for (int i=start_index;i<min(start_index+run_length,size-1);i++) {
+  for (int i=start_index;i<min(start_index+run_length,size - 1);i++) {
     if (v[i] > cutoff && v[i+1] <= cutoff) {
       result[0] = i+1;
       return;
     }
-    if (i == size - 1) {
-      result[0] = 0;
-    }
+  }
+  if (tid == 0 && v[size - 1] > cutoff) {
+    result[0] = size;
   }
 }
 
@@ -221,7 +221,7 @@ template <typename P>
 __global__ void distribute_large_buckets_kernel(P* large_buckets, P* buckets, unsigned *single_bucket_indices, unsigned size){
 
   unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (tid>=size){ 
+  if (tid>=size){
     return;
   }
   buckets[single_bucket_indices[tid]] = large_buckets[tid];
@@ -292,7 +292,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
   S *d_scalars;
   A *d_points;
   if (!on_device) {
-    //copy scalars and point to gpu
+    //copy scalars and points to gpu
     cudaMallocAsync(&d_scalars, sizeof(S) * size, stream);
     cudaMallocAsync(&d_points, sizeof(A) * size, stream);
     cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
@@ -427,6 +427,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
   unsigned bucket_th = large_bucket_factor*avarage_size;
   unsigned *nof_large_buckets;
   cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream);
+  cudaMemset(nof_large_buckets, 0, sizeof(unsigned));
 
   unsigned TOTAL_THREADS = 129000; //todo - device dependant
   unsigned cutoff_run_length = max(2,h_nof_buckets_to_compute/TOTAL_THREADS);
@@ -457,10 +458,11 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
     unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket;
     unsigned total_large_buckets_size = large_buckets_to_compute*threads_per_bucket;
     cudaMallocAsync(&large_buckets, sizeof(P)*total_large_buckets_size, stream);
+
     NUM_THREADS = min(1 << 8,total_large_buckets_size);
     NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS;
     accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(large_buckets, sorted_bucket_offsets+h_nof_zero_large_buckets, sorted_bucket_sizes+h_nof_zero_large_buckets, sorted_single_bucket_indices+h_nof_zero_large_buckets, point_indices, 
-    d_points, nof_buckets, large_buckets_to_compute, c+bm_bitsize, c, threads_per_bucket, max_bucket_size_run_length);                   
+    d_points, nof_buckets, large_buckets_to_compute, c+bm_bitsize, c, threads_per_bucket, max_bucket_size_run_length);
 
     //reduce
     for (int s=total_large_buckets_size>>1;s>large_buckets_to_compute-1;s>>=1) {
@@ -479,10 +481,14 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
   }
 
   //launch the accumulation kernel with maximum threads
-  NUM_THREADS = 1 << 8;
-  NUM_BLOCKS = (h_nof_buckets_to_compute-h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
-  accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, sorted_bucket_offsets+h_nof_large_buckets, sorted_bucket_sizes+h_nof_large_buckets, sorted_single_bucket_indices+h_nof_large_buckets, point_indices, 
-                                                          d_points, nof_buckets, h_nof_buckets_to_compute-h_nof_large_buckets, c+bm_bitsize, c);                   
+  if (h_nof_buckets_to_compute > h_nof_large_buckets) {
+    NUM_THREADS = 1 << 8;
+    NUM_BLOCKS = (h_nof_buckets_to_compute-h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
+    accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, sorted_bucket_offsets+h_nof_large_buckets, sorted_bucket_sizes+h_nof_large_buckets, sorted_single_bucket_indices+h_nof_large_buckets, point_indices, 
+                                                            d_points, nof_buckets, h_nof_buckets_to_compute-h_nof_large_buckets, c+bm_bitsize, c);
+  }
+
+  // all the large buckets need to be accumulated before the final summation
   cudaStreamSynchronize(stream2);
   cudaStreamDestroy(stream2);
 
@@ -491,7 +497,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
     NUM_THREADS = 1 << 10;
     NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
     ssm_buckets_kernel<fake_point, fake_scalar><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);
-   
+
     //sum each bucket module
     P* final_results;
     cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
@@ -500,6 +506,10 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
     sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS, 0, stream>>>(buckets, final_results);
   #endif
 
+  P* d_final_result;
+  if (!on_device)
+    cudaMallocAsync(&d_final_result, sizeof(P), stream);
+
   P* final_results;
   if (big_triangle){
     cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
@@ -511,7 +521,6 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
     #else
     big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c); 
     #endif
-
   }
   else {
     unsigned source_bits_count = c;
@@ -541,16 +550,15 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
           unsigned nof_threads = (source_buckets_count>>(1+j));
           NUM_THREADS = min(MAX_TH,nof_threads);
           NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
-          single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream>>>(j==0?source_buckets:temp_buckets2,j==target_bits_count-1? target_buckets: temp_buckets2,1<<(target_bits_count-j),j==target_bits_count-1? 1<<target_bits_count: 0,1,0)  ;
-
+          single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream>>>(j==0?source_buckets:temp_buckets2,j==target_bits_count-1? target_buckets: temp_buckets2,1<<(target_bits_count-j),j==target_bits_count-1? 1<<target_bits_count: 0,1,0);
         }
       }
       if (target_bits_count == 1) {
         nof_bms = bitsize;
         cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
         NUM_THREADS = 32;
         NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS;
-        last_pass_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(target_buckets,final_results,nof_bms);
+        last_pass_kernel<<<NUM_BLOCKS,NUM_THREADS,0,stream>>>(target_buckets,final_results,nof_bms);
         c = 1;
         cudaFreeAsync(source_buckets,stream);
         cudaFreeAsync(target_buckets,stream);
@@ -572,13 +580,11 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
     }
   }
 
-  P* d_final_result;
-  if (!on_device)
-    cudaMallocAsync(&d_final_result, sizeof(P), stream);
-
   //launch the double and add kernel, a single thread
   final_accumulation_kernel<P, S><<<1,1,0,stream>>>(final_results, ones_results, on_device ? final_result : d_final_result, 1, nof_bms, c);
+  cudaFreeAsync(final_results, stream);
   cudaStreamSynchronize(stream);
+
   if (!on_device)
     cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);
 
@@ -603,7 +609,6 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
   cudaFreeAsync(nof_large_buckets,stream);
   cudaFreeAsync(max_res,stream);
   if (large_buckets_to_compute>0 && bucket_th>0) cudaFreeAsync(large_buckets,stream);
-  cudaFreeAsync(final_results, stream);
   cudaFreeAsync(ones_results, stream);
 
   cudaStreamSynchronize(stream);

diff --git a/src/test_bn254.rs b/src/test_bn254.rs
@@ -825,7 +825,7 @@ pub fn generate_random_points100_bn254(
 ) -> Vec<PointAffineNoInfinity_BN254> {
     let mut res =  Vec::new();
     for i in 0..count{
-        if (i<100) {
+        if (i<1024) {
             res.push(Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng)).to_xy_strip_z());
         }
         else {
@@ -945,6 +945,14 @@ pub(crate) mod tests_bn254 {
         assert!(check_eq(&result2, &points));
     }
 
+    fn decode_hex(s: &str) -> Vec<u32> {
+        (0..s.len())
+            .step_by(8)
+            .map(|i| u32::from_str_radix(&(s[i..i + 8].chars().rev().collect::<String>()), 16)
+            .expect(&format!("{:?}", s[i..i + 8].as_bytes())))
+            .collect()
+    }
+
     #[test]
     fn test_msm() {
         let test_sizes = [24];
@@ -956,7 +964,7 @@ pub(crate) mod tests_bn254 {
             let points = generate_random_points100_bn254(count, get_rng_bn254(seed));
             let scalars = generate_random_scalars_bn254(count, get_rng_bn254(seed));
 
-            let msm_result = msm_bn254(&points, &scalars, 0);
+            let msm_result = msm_bn254(&points, &scalars, 10);
 
             let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
             let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
@@ -972,6 +980,41 @@ pub(crate) mod tests_bn254 {
         }
     }
 
+    #[test]
+    fn test_custom_msm_distributions() {
+        let mut i = 0;
+        // loop over all the saved distributions: scalars0.txt, scalars1.txt, ...
+        while let Ok(scalars_file) = std::fs::read_to_string(format!("src/scalars{}.txt", i)) {
+            let scalars_file = decode_hex(&scalars_file);
+            let scalars = (0..scalars_file.len()).step_by(8)
+                                                .map(|i| ScalarField_BN254::from_limbs(&scalars_file[i..i+8]))
+                                                .collect::<Vec<_>>();
+
+            let points = if let Ok(points_file) = std::fs::read_to_string(format!("src/points{}.txt", i)) {
+                let points_file = decode_hex(&points_file);
+                (0..points_file.len()).step_by(16)
+                    .map(|i| PointAffineNoInfinity_BN254::from_limbs(&points_file[i..i+8], &points_file[i+8..i+16]))
+                    .collect::<Vec<_>>()
+            } else {
+                // it doesn't really matter if there are no points.txt file as points shouldn't affect the performance or correctness
+                let seed = Some(0);
+                generate_random_points100_bn254(scalars.len(), get_rng_bn254(seed))
+            };
+            assert!(points[0].to_ark_repr().is_on_curve());
+
+            assert_eq!(scalars.len(), points.len());
+            let msm_result = msm_bn254(&points, &scalars, 10);
+
+            let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
+            let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
+
+            let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);
+
+            assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
+            i += 1;
+        }
+    }
+
     #[test]
     fn test_batch_msm() {
         for batch_pow2 in [2, 4] {
@@ -1463,8 +1506,6 @@ pub(crate) mod tests_bn254 {
         let (_, _, mut d_large_domain) = set_up_scalars_bn254(0, log_test_size + 1, false);
         let mut d_coset_powers = build_domain_bn254(test_size, log_test_size + 1, false);
 
-        println!("d_coset_powers len {}", d_coset_powers.len());
-
         let mut d_evals_large = evaluate_scalars_batch_bn254(&mut d_coeffs, &mut d_large_domain, batch_size);
         let mut h_evals_large: Vec<ScalarField_BN254> = (0..2 * test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect();
         d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();