Skip to content

Commit

Permalink
Merge branch 'dev' into fix/goicicle-setup-script
Browse files Browse the repository at this point in the history
  • Loading branch information
LeonHibnik committed Aug 15, 2023
2 parents 36133ba + 19d0730 commit 11fe11b
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 25 deletions.
47 changes: 26 additions & 21 deletions icicle/appUtils/msm/msm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,14 @@ __global__ void find_cutoff_kernel(unsigned *v, unsigned size, unsigned cutoff,
return;
}
const unsigned start_index = tid*run_length;
for (int i=start_index;i<min(start_index+run_length,size-1);i++) {
for (int i=start_index;i<min(start_index+run_length,size - 1);i++) {
if (v[i] > cutoff && v[i+1] <= cutoff) {
result[0] = i+1;
return;
}
if (i == size - 1) {
result[0] = 0;
}
}
if (tid == 0 && v[size - 1] > cutoff) {
result[0] = size;
}
}

Expand Down Expand Up @@ -221,7 +221,7 @@ template <typename P>
__global__ void distribute_large_buckets_kernel(P* large_buckets, P* buckets, unsigned *single_bucket_indices, unsigned size){

unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid>=size){
if (tid>=size){
return;
}
buckets[single_bucket_indices[tid]] = large_buckets[tid];
Expand Down Expand Up @@ -292,7 +292,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
S *d_scalars;
A *d_points;
if (!on_device) {
//copy scalars and point to gpu
//copy scalars and points to gpu
cudaMallocAsync(&d_scalars, sizeof(S) * size, stream);
cudaMallocAsync(&d_points, sizeof(A) * size, stream);
cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
Expand Down Expand Up @@ -427,6 +427,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
unsigned bucket_th = large_bucket_factor*avarage_size;
unsigned *nof_large_buckets;
cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream);
cudaMemset(nof_large_buckets, 0, sizeof(unsigned));

unsigned TOTAL_THREADS = 129000; //todo - device dependant
unsigned cutoff_run_length = max(2,h_nof_buckets_to_compute/TOTAL_THREADS);
Expand Down Expand Up @@ -457,10 +458,11 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket;
unsigned total_large_buckets_size = large_buckets_to_compute*threads_per_bucket;
cudaMallocAsync(&large_buckets, sizeof(P)*total_large_buckets_size, stream);

NUM_THREADS = min(1 << 8,total_large_buckets_size);
NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS;
accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(large_buckets, sorted_bucket_offsets+h_nof_zero_large_buckets, sorted_bucket_sizes+h_nof_zero_large_buckets, sorted_single_bucket_indices+h_nof_zero_large_buckets, point_indices,
d_points, nof_buckets, large_buckets_to_compute, c+bm_bitsize, c, threads_per_bucket, max_bucket_size_run_length);
d_points, nof_buckets, large_buckets_to_compute, c+bm_bitsize, c, threads_per_bucket, max_bucket_size_run_length);

//reduce
for (int s=total_large_buckets_size>>1;s>large_buckets_to_compute-1;s>>=1) {
Expand All @@ -479,10 +481,14 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
}

//launch the accumulation kernel with maximum threads
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (h_nof_buckets_to_compute-h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, sorted_bucket_offsets+h_nof_large_buckets, sorted_bucket_sizes+h_nof_large_buckets, sorted_single_bucket_indices+h_nof_large_buckets, point_indices,
d_points, nof_buckets, h_nof_buckets_to_compute-h_nof_large_buckets, c+bm_bitsize, c);
if (h_nof_buckets_to_compute > h_nof_large_buckets) {
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (h_nof_buckets_to_compute-h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, sorted_bucket_offsets+h_nof_large_buckets, sorted_bucket_sizes+h_nof_large_buckets, sorted_single_bucket_indices+h_nof_large_buckets, point_indices,
d_points, nof_buckets, h_nof_buckets_to_compute-h_nof_large_buckets, c+bm_bitsize, c);
}

// all the large buckets need to be accumulated before the final summation
cudaStreamSynchronize(stream2);
cudaStreamDestroy(stream2);

Expand All @@ -491,7 +497,7 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
ssm_buckets_kernel<fake_point, fake_scalar><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);

//sum each bucket module
P* final_results;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
Expand All @@ -500,6 +506,10 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS, 0, stream>>>(buckets, final_results);
#endif

P* d_final_result;
if (!on_device)
cudaMallocAsync(&d_final_result, sizeof(P), stream);

P* final_results;
if (big_triangle){
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
Expand All @@ -511,7 +521,6 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
#else
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c);
#endif

}
else {
unsigned source_bits_count = c;
Expand Down Expand Up @@ -541,16 +550,15 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
unsigned nof_threads = (source_buckets_count>>(1+j));
NUM_THREADS = min(MAX_TH,nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream>>>(j==0?source_buckets:temp_buckets2,j==target_bits_count-1? target_buckets: temp_buckets2,1<<(target_bits_count-j),j==target_bits_count-1? 1<<target_bits_count: 0,1,0) ;

single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream>>>(j==0?source_buckets:temp_buckets2,j==target_bits_count-1? target_buckets: temp_buckets2,1<<(target_bits_count-j),j==target_bits_count-1? 1<<target_bits_count: 0,1,0);
}
}
if (target_bits_count == 1) {
nof_bms = bitsize;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
NUM_THREADS = 32;
NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(target_buckets,final_results,nof_bms);
last_pass_kernel<<<NUM_BLOCKS,NUM_THREADS,0,stream>>>(target_buckets,final_results,nof_bms);
c = 1;
cudaFreeAsync(source_buckets,stream);
cudaFreeAsync(target_buckets,stream);
Expand All @@ -572,13 +580,11 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
}
}

P* d_final_result;
if (!on_device)
cudaMallocAsync(&d_final_result, sizeof(P), stream);

//launch the double and add kernel, a single thread
final_accumulation_kernel<P, S><<<1,1,0,stream>>>(final_results, ones_results, on_device ? final_result : d_final_result, 1, nof_bms, c);
cudaFreeAsync(final_results, stream);
cudaStreamSynchronize(stream);

if (!on_device)
cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);

Expand All @@ -603,7 +609,6 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
cudaFreeAsync(nof_large_buckets,stream);
cudaFreeAsync(max_res,stream);
if (large_buckets_to_compute>0 && bucket_th>0) cudaFreeAsync(large_buckets,stream);
cudaFreeAsync(final_results, stream);
cudaFreeAsync(ones_results, stream);

cudaStreamSynchronize(stream);
Expand Down
49 changes: 45 additions & 4 deletions src/test_bn254.rs
Original file line number Diff line number Diff line change
Expand Up @@ -825,7 +825,7 @@ pub fn generate_random_points100_bn254(
) -> Vec<PointAffineNoInfinity_BN254> {
let mut res = Vec::new();
for i in 0..count{
if (i<100) {
if (i<1024) {
res.push(Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng)).to_xy_strip_z());
}
else {
Expand Down Expand Up @@ -945,6 +945,14 @@ pub(crate) mod tests_bn254 {
assert!(check_eq(&result2, &points));
}

fn decode_hex(s: &str) -> Vec<u32> {
(0..s.len())
.step_by(8)
.map(|i| u32::from_str_radix(&(s[i..i + 8].chars().rev().collect::<String>()), 16)
.expect(&format!("{:?}", s[i..i + 8].as_bytes())))
.collect()
}

#[test]
fn test_msm() {
let test_sizes = [24];
Expand All @@ -956,7 +964,7 @@ pub(crate) mod tests_bn254 {
let points = generate_random_points100_bn254(count, get_rng_bn254(seed));
let scalars = generate_random_scalars_bn254(count, get_rng_bn254(seed));

let msm_result = msm_bn254(&points, &scalars, 0);
let msm_result = msm_bn254(&points, &scalars, 10);

let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
Expand All @@ -972,6 +980,41 @@ pub(crate) mod tests_bn254 {
}
}

#[test]
fn test_custom_msm_distributions() {
let mut i = 0;
// loop over all the saved distributions: scalars0.txt, scalars1.txt, ...
while let Ok(scalars_file) = std::fs::read_to_string(format!("src/scalars{}.txt", i)) {
let scalars_file = decode_hex(&scalars_file);
let scalars = (0..scalars_file.len()).step_by(8)
.map(|i| ScalarField_BN254::from_limbs(&scalars_file[i..i+8]))
.collect::<Vec<_>>();

let points = if let Ok(points_file) = std::fs::read_to_string(format!("src/points{}.txt", i)) {
let points_file = decode_hex(&points_file);
(0..points_file.len()).step_by(16)
.map(|i| PointAffineNoInfinity_BN254::from_limbs(&points_file[i..i+8], &points_file[i+8..i+16]))
.collect::<Vec<_>>()
} else {
// it doesn't really matter if there are no points.txt file as points shouldn't affect the performance or correctness
let seed = Some(0);
generate_random_points100_bn254(scalars.len(), get_rng_bn254(seed))
};
assert!(points[0].to_ark_repr().is_on_curve());

assert_eq!(scalars.len(), points.len());
let msm_result = msm_bn254(&points, &scalars, 10);

let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();

let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);

assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
i += 1;
}
}

#[test]
fn test_batch_msm() {
for batch_pow2 in [2, 4] {
Expand Down Expand Up @@ -1463,8 +1506,6 @@ pub(crate) mod tests_bn254 {
let (_, _, mut d_large_domain) = set_up_scalars_bn254(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain_bn254(test_size, log_test_size + 1, false);

println!("d_coset_powers len {}", d_coset_powers.len());

let mut d_evals_large = evaluate_scalars_batch_bn254(&mut d_coeffs, &mut d_large_domain, batch_size);
let mut h_evals_large: Vec<ScalarField_BN254> = (0..2 * test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
Expand Down

0 comments on commit 11fe11b

Please sign in to comment.