Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev v2 #95

Merged
merged 6 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add streams capability (#89)
  • Loading branch information
jeremyfelder committed Jun 1, 2023
commit 20de60fd431daeca821f9d3588b1954af398a8bc
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ This PR...

## Linked Issues

Closes #
Resolves #
1 change: 1 addition & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ on:
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: sm_70
DEFAULT_STREAM: per-thread

jobs:
build-linux:
Expand Down
4 changes: 4 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@ fn main() {
println!("cargo:rerun-if-changed=./icicle");

let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));

let mut arch = String::from("-arch=");
arch.push_str(&arch_type);
let mut stream = String::from("-default-stream=");
stream.push_str(&stream_type);

let mut nvcc = cc::Build::new();

Expand All @@ -22,6 +25,7 @@ fn main() {
nvcc.cuda(true);
nvcc.debug(false);
nvcc.flag(&arch);
nvcc.flag(&stream);
nvcc.files([
"./icicle/curves/index.cu",
]);
Expand Down
263 changes: 136 additions & 127 deletions icicle/appUtils/msm/msm.cu

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions icicle/appUtils/msm/msm.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@
#pragma once

template <typename S, typename P, typename A>
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device);
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream);

template <typename S, typename P, typename A>
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device);
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream);

template <typename S, typename P, typename A>
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device);
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream);

template <typename S, typename P, typename A>
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device);
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream);

template <typename S, typename P, typename A>
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, bool on_device);
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream);

template <typename A, typename S, typename P>
void reference_msm(S* scalars, A* a_points, unsigned size);
Expand Down
74 changes: 38 additions & 36 deletions icicle/appUtils/ntt/lde.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,20 @@
* @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial.
* @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`.
*/
template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size) {
template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
uint32_t logn = uint32_t(log(n) / log(2));
cudaMemcpy(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice);
cudaMemcpyAsync(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice, stream);

int NUM_THREADS = min(n / 2, MAX_THREADS_BATCH);
int NUM_BLOCKS = batch_size * max(int((n / 2) / NUM_THREADS), 1);
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
{
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
}

NUM_BLOCKS = (n * batch_size + NUM_THREADS - 1) / NUM_THREADS;
template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>> (d_out, n * batch_size, S::inv_log_size(logn));
template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>> (d_out, n * batch_size, S::inv_log_size(logn));
cudaStreamSynchronize(stream);
return 0;
}

Expand All @@ -39,8 +40,8 @@ template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluat
* @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup.
* @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length).
*/
template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n) {
return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1);
template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, cudaStream_t stream) {
return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1, stream);
}

template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
Expand All @@ -62,7 +63,7 @@ template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
* @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
*/
template <typename E, typename S>
int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers) {
int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) {
uint32_t logn = uint32_t(log(domain_size) / log(2));
if (domain_size > n) {
// allocate and initialize an array of stream handles to parallelize data copying across batches
Expand All @@ -80,18 +81,19 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
cudaStreamDestroy(memcpy_streams[i]);
}
} else
cudaMemcpy(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice);
cudaMemcpyAsync(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice, stream);

if (coset)
batch_vector_mult(coset_powers, d_out, domain_size, batch_size);
batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream);

int NUM_THREADS = min(domain_size / 2, MAX_THREADS_BATCH);
int chunks = max(int((domain_size / 2) / NUM_THREADS), 1);
int NUM_BLOCKS = batch_size * chunks;
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
{
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
}
cudaStreamSynchronize(stream);
return 0;
}

Expand All @@ -107,76 +109,76 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
* @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
*/
template <typename E, typename S>
int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers) {
return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers);
int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) {
return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers, stream);
}

template <typename S>
int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n) {
return interpolate(d_out, d_evaluations, d_domain, n);
int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}

template <typename S>
int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}

template <typename E, typename S>
int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n) {
return interpolate(d_out, d_evaluations, d_domain, n);
int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}

template <typename E, typename S>
int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}

template <typename S>
int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
S* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}

template <typename S>
int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size) {
int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
S* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}

template <typename E, typename S>
int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
S* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}

template <typename E, typename S>
int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, unsigned batch_size) {
unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
S* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}

template <typename S>
int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers) {
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
}

template <typename E, typename S>
int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers) {
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
}

template <typename E, typename S>
int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers) {
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
}

template <typename E, typename S>
int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers) {
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
}
#endif
24 changes: 12 additions & 12 deletions icicle/appUtils/ntt/lde.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,44 @@
#pragma once

template <typename S>
int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n);
int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);

template <typename S>
int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);

template <typename E, typename S>
int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n);
int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);

template <typename E, typename S>
int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);

template <typename S>
int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);

template <typename S>
int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size);
int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);

template <typename E, typename S>
int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);

template <typename E, typename S>
int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, unsigned batch_size);
unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);

template <typename S>
int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers);
unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);

template <typename S>
int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers);
unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);

template <typename E, typename S>
int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers);
unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);

template <typename E, typename S>
int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers);
unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);

#endif
Loading