G2 CUDA (#78)
G2 arithmetic and MSM implemented in CUDA
DmytroTym authored and jeremyfelder committed Jun 1, 2023
1 parent 9ebf3d4 commit b5a24d8
Showing 30 changed files with 864 additions and 437 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -46,3 +46,4 @@ cc = { version = "1.0", features = ["parallel"] }
[features]
default = ["bls12_381"]
bls12_381 = ["ark-bls12-381/curve"]
g2 = []
20 changes: 19 additions & 1 deletion benches/msm.rs
@@ -6,26 +6,44 @@ use icicle_utils::test_bls12_381::{
commit_batch_bls12_381, generate_random_points_bls12_381, set_up_scalars_bls12_381,
};
use icicle_utils::utils::*;
#[cfg(feature = "g2")]
use icicle_utils::{commit_batch_g2, field::ExtensionField};

use rustacuda::prelude::*;

const LOG_MSM_SIZES: [usize; 1] = [12];
const BATCH_SIZES: [usize; 2] = [128, 256];

fn bench_msm(c: &mut Criterion) {
let mut group = c.benchmark_group("MSM");
for log_msm_size in LOG_MSM_SIZES {
for batch_size in BATCH_SIZES {
let msm_size = 1 << log_msm_size;
let (scalars, _, _) = set_up_scalars_bls12_381(msm_size, 0, false);
let batch_scalars = vec![scalars; batch_size].concat();
let mut d_scalars = DeviceBuffer::from_slice(&batch_scalars[..]).unwrap();

let points = generate_random_points_bls12_381(msm_size, get_rng(None));
let batch_points = vec![points; batch_size].concat();
let mut d_points = DeviceBuffer::from_slice(&batch_points[..]).unwrap();

c.bench_function(
#[cfg(feature = "g2")]
let g2_points = generate_random_points::<ExtensionField>(msm_size, get_rng(None));
#[cfg(feature = "g2")]
let g2_batch_points = vec![g2_points; batch_size].concat();
#[cfg(feature = "g2")]
let mut d_g2_points = DeviceBuffer::from_slice(&g2_batch_points[..]).unwrap();

group.sample_size(30).bench_function(
&format!("MSM of size 2^{} in batch {}", log_msm_size, batch_size),
|b| b.iter(|| commit_batch_bls12_381(&mut d_points, &mut d_scalars, batch_size)),
);

#[cfg(feature = "g2")]
group.sample_size(10).bench_function(
&format!("G2 MSM of size 2^{} in batch {}", log_msm_size, batch_size),
|b| b.iter(|| commit_batch_g2(&mut d_g2_points, &mut d_scalars, batch_size))
);
}
}
}
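For orientation, here is a minimal sketch of a standalone G2 batch MSM call, adapted from the benchmark above. It is a sketch, not part of the commit: it assumes the items the benchmark imports (commit_batch_g2, field::ExtensionField, set_up_scalars_bls12_381, get_rng via icicle_utils::utils::*), the import path for generate_random_points is collapsed in the hunk and assumed below, and the build needs the new Cargo feature enabled, e.g. cargo bench --features g2.

// Sketch only: mirrors benches/msm.rs from this commit. The path of
// `generate_random_points` and any CUDA context setup the crate performs
// elsewhere are assumptions, not shown in the diff above.
use icicle_utils::test_bls12_381::{generate_random_points, set_up_scalars_bls12_381};
use icicle_utils::utils::*;
use icicle_utils::{commit_batch_g2, field::ExtensionField};
use rustacuda::prelude::*;

fn main() {
    let msm_size = 1 << 12;
    let batch_size = 128;

    // Scalars are shared between the G1 and G2 MSMs, exactly as in the benchmark.
    let (scalars, _, _) = set_up_scalars_bls12_381(msm_size, 0, false);
    let batch_scalars = vec![scalars; batch_size].concat();
    let mut d_scalars = DeviceBuffer::from_slice(&batch_scalars[..]).unwrap();

    // G2 points are sampled over the quadratic extension field.
    let g2_points = generate_random_points::<ExtensionField>(msm_size, get_rng(None));
    let g2_batch_points = vec![g2_points; batch_size].concat();
    let mut d_g2_points = DeviceBuffer::from_slice(&g2_batch_points[..]).unwrap();

    // One G2 MSM of size msm_size per batch element, computed on the GPU.
    let _ = commit_batch_g2(&mut d_g2_points, &mut d_scalars, batch_size);
}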
28 changes: 11 additions & 17 deletions benches/ntt.rs
@@ -8,33 +8,27 @@ use icicle_utils::test_bls12_381::{interpolate_scalars_batch_bls12_381, interpol
const LOG_NTT_SIZES: [usize; 1] = [15];
const BATCH_SIZES: [usize; 2] = [8, 16];

fn bench_point_ntt(c: &mut Criterion) {
for log_ntt_size in LOG_NTT_SIZES {
for batch_size in BATCH_SIZES {
let ntt_size = 1 << log_ntt_size;
let (_, mut d_evals, mut d_domain) = set_up_points_bls12_381(ntt_size * batch_size, log_ntt_size, true);

c.bench_function(
&format!("EC NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| interpolate_points_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size))
);
}
}
}

fn bench_scalar_ntt(c: &mut Criterion) {
fn bench_ntt(c: &mut Criterion) {
let mut group = c.benchmark_group("NTT");
for log_ntt_size in LOG_NTT_SIZES {
for batch_size in BATCH_SIZES {
let ntt_size = 1 << log_ntt_size;
let (_, mut d_evals, mut d_domain) = set_up_scalars_bls12_381(ntt_size * batch_size, log_ntt_size, true);
let (_, mut d_points_evals, _) = set_up_points_bls12_381(ntt_size * batch_size, log_ntt_size, true);

c.bench_function(
group.sample_size(100).bench_function(
&format!("Scalar NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| interpolate_scalars_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size))
);

group.sample_size(10).bench_function(
&format!("EC NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| interpolate_points_batch_bls12_381(&mut d_points_evals, &mut d_domain, batch_size))
);
}
}
}

criterion_group!(ntt_benches, bench_point_ntt, bench_scalar_ntt);
criterion_group!(ntt_benches, bench_ntt);
criterion_main!(ntt_benches);

3 changes: 3 additions & 0 deletions build.rs
@@ -16,6 +16,9 @@ fn main() {

println!("Compiling icicle library using arch: {}", &arch);

if cfg!(feature = "g2") {
nvcc.define("G2_DEFINED", None);
}
nvcc.cuda(true);
nvcc.debug(false);
nvcc.flag(&arch);
2 changes: 1 addition & 1 deletion curve_parameters/new_curve_script.py
@@ -140,7 +140,7 @@ def get_config_file_content(modolus_p, bit_count_p, limb_p, ntt_size, modolus_q,
#include <cuda.h>\n
#include "curve_config.cuh"\n
#include "../../primitives/projective.cuh"\n
extern "C" bool eq_CURVE_NAME_L(CURVE_NAME_U::projective_t *point1, CURVE_NAME_U::projective_t *point2, size_t device_id = 0)
extern "C" bool eq_CURVE_NAME_L(CURVE_NAME_U::projective_t *point1, CURVE_NAME_U::projective_t *point2)
{
return (*point1 == *point2);
}'''
7 changes: 3 additions & 4 deletions icicle/appUtils/ntt/lde.cu
@@ -67,13 +67,13 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
if (domain_size > n) {
// allocate and initialize an array of stream handles to parallelize data copying across batches
cudaStream_t *memcpy_streams = (cudaStream_t *) malloc(batch_size * sizeof(cudaStream_t));
for (int i = 0; i < batch_size; i++)
for (unsigned i = 0; i < batch_size; i++)
{
cudaStreamCreate(&(memcpy_streams[i]));

cudaMemcpyAsync(&d_out[i * domain_size], &d_coefficients[i * n], n * sizeof(E), cudaMemcpyDeviceToDevice, memcpy_streams[i]);
int NUM_THREADS = MAX_THREADS_BATCH;
int NUM_BLOCKS = (domain_size - n + NUM_THREADS - 1) / NUM_THREADS;
uint32_t NUM_THREADS = MAX_THREADS_BATCH;
uint32_t NUM_BLOCKS = (domain_size - n + NUM_THREADS - 1) / NUM_THREADS;
fill_array <E> <<<NUM_BLOCKS, NUM_THREADS, 0, memcpy_streams[i]>>> (&d_out[i * domain_size + n], E::zero(), domain_size - n);

cudaStreamSynchronize(memcpy_streams[i]);
@@ -179,5 +179,4 @@ int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, uns
unsigned n, unsigned batch_size, S* coset_powers) {
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
}

#endif
22 changes: 11 additions & 11 deletions icicle/appUtils/ntt/ntt.cuh
@@ -46,7 +46,7 @@ const uint32_t MAX_THREADS_BATCH = 256;
*/
__device__ __host__ uint32_t reverseBits(uint32_t num, uint32_t logn) {
unsigned int reverse_num = 0;
for (int i = 0; i < logn; i++) {
for (uint32_t i = 0; i < logn; i++) {
if ((num & (1 << i))) reverse_num |= 1 << ((logn - 1) - i);
}
return reverse_num;
@@ -159,9 +159,9 @@ template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr
uint32_t m = 2;
for (uint32_t s = 0; s < logn; s++) {
for (uint32_t i = 0; i < n; i += m) {
int shifted_m = m >> 1;
int number_of_threads = MAX_NUM_THREADS ^ ((shifted_m ^ MAX_NUM_THREADS) & -(shifted_m < MAX_NUM_THREADS));
int number_of_blocks = shifted_m / MAX_NUM_THREADS + 1;
uint32_t shifted_m = m >> 1;
uint32_t number_of_threads = MAX_NUM_THREADS ^ ((shifted_m ^ MAX_NUM_THREADS) & -(shifted_m < MAX_NUM_THREADS));
uint32_t number_of_blocks = shifted_m / MAX_NUM_THREADS + 1;
template_butterfly_kernel < E, S > <<< number_of_threads, number_of_blocks >>> (d_arr, d_twiddles, n, n_twiddles, m, i, m >> 1);
}
m <<= 1;
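A note on the launch-size line in this hunk: MAX_NUM_THREADS ^ ((shifted_m ^ MAX_NUM_THREADS) & -(shifted_m < MAX_NUM_THREADS)) is a standard branchless minimum, so number_of_threads is simply min(shifted_m, MAX_NUM_THREADS). A small self-contained illustration of the identity (names and the constant below are illustrative only, not taken from the repository):

// Branchless min: b ^ ((a ^ b) & mask) with mask = -(a < b) as an unsigned word;
// the mask is all ones when a < b (selecting a) and zero otherwise (selecting b).
fn branchless_min(a: u32, b: u32) -> u32 {
    let mask = ((a < b) as u32).wrapping_neg();
    b ^ ((a ^ b) & mask)
}

fn main() {
    let max_num_threads: u32 = 1024; // illustrative value only
    assert_eq!(branchless_min(128, max_num_threads), 128);
    assert_eq!(branchless_min(4096, max_num_threads), max_num_threads);
}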
@@ -229,14 +229,14 @@ template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_
* @param logn log(n).
* @param task log(n).
*/
template < typename T > __device__ __host__ void reverseOrder_batch(T * arr, uint32_t n, uint32_t logn, uint32_t task) {
template < typename T > __device__ __host__ void reverseOrder_batch(T * arr, uint32_t n, uint32_t logn, uint32_t task) {
for (uint32_t i = 0; i < n; i++) {
uint32_t reversed = reverseBits(i, logn);
if (reversed > i) {
T tmp = arr[task * n + i];
arr[task * n + i] = arr[task * n + reversed];
arr[task * n + reversed] = tmp;
}
uint32_t reversed = reverseBits(i, logn);
if (reversed > i) {
T tmp = arr[task * n + i];
arr[task * n + i] = arr[task * n + reversed];
arr[task * n + reversed] = tmp;
}
}
}

20 changes: 18 additions & 2 deletions icicle/curves/bls12_377/curve_config.cuh
@@ -1,6 +1,22 @@
#pragma once

#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"

#include "params.cuh"

namespace BLS12_377 {
typedef Field<PARAMS_BLS12_377::fp_config> scalar_field_t; typedef scalar_field_t scalar_t; typedef Field<PARAMS_BLS12_377::fq_config> point_field_t;
typedef Projective<point_field_t, scalar_field_t, PARAMS_BLS12_377::group_generator, PARAMS_BLS12_377::weierstrass_b> projective_t;
typedef Field<PARAMS_BLS12_377::fp_config> scalar_field_t;
typedef scalar_field_t scalar_t;
typedef Field<PARAMS_BLS12_377::fq_config> point_field_t;
static constexpr point_field_t b = point_field_t{ PARAMS_BLS12_377::weierstrass_b };
typedef Projective<point_field_t, scalar_field_t, b> projective_t;
typedef Affine<point_field_t> affine_t;
#if defined(G2_DEFINED)
typedef ExtensionField<PARAMS_BLS12_377::fq_config> g2_point_field_t;
static constexpr g2_point_field_t b_g2 = g2_point_field_t{ point_field_t{ PARAMS_BLS12_377::weierstrass_b_g2_re },
point_field_t{ PARAMS_BLS12_377::weierstrass_b_g2_im }};
typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
typedef Affine<g2_point_field_t> g2_affine_t;
#endif
}
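For context on the new typedefs: b_g2 packs the G2 Weierstrass constant as an element of the quadratic extension field, built from the _re/_im parameters. Schematically (a hedged sketch; the concrete non-residue and twist are defined in params.cuh, which is not shown in this diff):

E' / \mathbb{F}_{q^2} :\; y^2 = x^3 + b_{g2}, \qquad b_{g2} = b_{\mathrm{re}} + b_{\mathrm{im}}\,u, \qquad \mathbb{F}_{q^2} = \mathbb{F}_q[u]/(u^2 - \beta)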
5 changes: 2 additions & 3 deletions icicle/curves/bls12_377/msm.cu
@@ -28,7 +28,6 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377:
try
{
batched_large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, batch_size, msm_size, out, false);

return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -51,7 +50,7 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377:
{
try
{
large_msm(d_scalars, d_points, count, d_out, true);
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(d_scalars, d_points, count, d_out, true);
return 0;
}
catch (const std::runtime_error &ex)
@@ -85,4 +84,4 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377:
}
}

#endif
#endif