Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremyfelder committed Sep 23, 2024
1 parent 98dd414 commit 67da239
Show file tree
Hide file tree
Showing 7 changed files with 397 additions and 0 deletions.
59 changes: 59 additions & 0 deletions icicle/include/fri/fri.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#pragma once
#ifndef FRI_H
#define FRI_H

#include <cuda_runtime.h>

#include "gpu-utils/device_context.cuh"

namespace fri {

/**
 * @struct FriConfig
 * Configuration for FRI folding operations (fold_line / fold_circle_into_line).
 */
struct FriConfig {
  device_context::DeviceContext ctx; // Device context carrying the CUDA stream to run on.
  bool are_evals_on_device;          // True if the input evaluations already live in device memory.
  bool are_domain_elements_on_device;// True if the domain elements already live in device memory.
  bool are_results_on_device;        // True if folded results should be left in device memory.
  bool is_async;                     // True to return without synchronizing the stream.
}; // NOTE: the trailing semicolon is required; omitting it is a compile error.

/**
 * @brief Folds a layer's evaluation into a degree d/2 evaluation using the provided folding factor alpha.
 *
 * @param evals Pointer to the array of evaluation in the current FRI layer.
 * @param domain_xs Pointer to a subset of line domain values.
 * @param alpha The folding factor used in the FRI protocol.
 * @param folded_evals Pointer to the array where the folded evaluations will be stored.
 * @param n The number of evaluations in the original layer (before folding).
 * @param cfg Configuration controlling device residency of the buffers and async behavior.
 *
 * @tparam S The scalar field type used for domain_xs.
 * @tparam E The evaluation type, typically the same as the field element type.
 *
 * @return `cudaSuccess` on success, or a CUDA error code otherwise.
 *
 * @note The size of the output array 'folded_evals' should be half of 'n', as folding reduces the number of evaluations by half.
 */
template <typename S, typename E>
cudaError_t fold_line(E* evals, S* domain_xs, E alpha, E* folded_evals, int n, FriConfig cfg);

/**
 * @brief Folds a layer of FRI evaluations from a circle into a line.
 *
 * This function performs the folding operation in the FRI (Fast Reed-Solomon IOP of Proximity) protocol,
 * specifically for evaluations on a circle domain. It takes a layer of evaluations on a circle and folds
 * them into a line using the provided folding factor alpha.
 *
 * @param evals Pointer to the array of evaluations in the current FRI layer, representing points on a circle.
 * @param domain_ys Pointer to the array of y-coordinates of the circle points in the domain of the circle that evals represents.
 * @param alpha The folding factor used in the FRI protocol.
 * @param folded_evals Pointer to the array where the folded evaluations (now on a line) will be stored.
 * @param n The number of evaluations in the original layer (before folding).
 * @param cfg Configuration controlling device residency of the buffers and async behavior.
 *
 * @tparam S The scalar field type used for alpha and domain_ys.
 * @tparam E The evaluation type, typically the same as the field element type.
 *
 * @return `cudaSuccess` on success, or a CUDA error code otherwise.
 *
 * @note The size of the output array 'folded_evals' should be half of 'n', as folding reduces the number of evaluations by half.
 * @note This function is specifically designed for folding evaluations from a circular domain to a linear domain.
 */
template <typename S, typename E>
cudaError_t fold_circle_into_line(E* evals, S* domain_ys, S alpha, E* folded_evals, int n, FriConfig cfg);

} // namespace fri

#endif // FRI_H  (guard was previously left unterminated — compile error at end of TU)
6 changes: 6 additions & 0 deletions icicle/src/fields/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ endif ()

SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin;m31)
SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252;m31)
SET(SUPPORTED_FIELDS_WITH_FRI m31)

set(TARGET icicle_field)

Expand Down Expand Up @@ -42,6 +43,11 @@ if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES}) # requires NTT
endif()

if (FIELD IN_LIST SUPPORTED_FIELDS_WITH_FRI)
list(APPEND FIELD_SOURCE ${SRC}/fri/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/fri/fri.cu)
endif()

add_library(${TARGET} STATIC ${FIELD_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
Expand Down
38 changes: 38 additions & 0 deletions icicle/src/fri/extern.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// NOTE(review): extension_t / scalar_t and the CONCAT_EXPAND macro are not declared by the
// includes that were here originally; field_config.cuh and utils.h provide them elsewhere in
// this codebase — confirm the exact paths against the build.
#include "fields/field_config.cuh"
using namespace field_config;

#include "fri/fri.cuh"
#include "utils/utils.h"
#include "gpu-utils/device_context.cuh"

namespace fri {
  /**
   * Extern "C" version of [fold_line](@ref fold_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param line_eval Pointer to the array of evaluations on the line
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg Configuration carrying the device context; if the stream is not 0, then everything is run async
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  // The symbol is prefixed with FIELD (not CURVE): this file is built per-field by
  // icicle/src/fields/CMakeLists.txt, and the doc above states the field comes from -DFIELD.
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_line)(
    extension_t* line_eval, scalar_t* domain_elements, extension_t alpha, extension_t* folded_evals, int n, FriConfig cfg)
  {
    return fold_line(line_eval, domain_elements, alpha, folded_evals, n, cfg);
  }

  /**
   * Extern "C" version of [fold_circle_into_line](@ref fold_circle_into_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param circle_evals Pointer to the array of evaluations on the circle
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_line_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg Configuration carrying the device context; if the stream is not 0, then everything is run async
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_circle_into_line)(
    extension_t* circle_evals, scalar_t* domain_elements, extension_t alpha, extension_t* folded_line_evals, int n, FriConfig cfg)
  {
    return fold_circle_into_line(circle_evals, domain_elements, alpha, folded_line_evals, n, cfg);
  }
} // namespace fri
142 changes: 142 additions & 0 deletions icicle/src/fri/fri.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#include <cuda_runtime.h>

// Was `#include "fri.h"` — no such file exists; the header added in this change is
// icicle/include/fri/fri.cuh, and the include root is icicle/include (see CMakeLists).
#include "fri/fri.cuh"

#include "fields/field.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/device_context.cuh"

// NOTE(review): these templates are defined in a .cu translation unit; the extern "C"
// wrappers in extern.cu need either explicit instantiations here or to #include this
// file — verify against how the rest of the library handles template definitions.

namespace fri {

  namespace {
    // Inverse butterfly: (v0, v1) <- (v0 + v1, (v0 - v1) * itwid).
    template <typename S, typename E>
    __device__ void ibutterfly(E& v0, E& v1, const S& itwid)
    {
      E tmp = v0;
      v0 = tmp + v1;
      v1 = (tmp - v1) * itwid;
    }

    // One thread per folded output. The grid must cover [0, half_n).
    // Reads evals[2*i] (f(x)) and evals[2*i+1] (f(-x)); writes folded_evals[i].
    // (Previously the kernel required `idx % 2 == 0 && idx < n` while the host launched
    // only n/2 threads, so only n/4 outputs were ever produced.)
    template <typename S, typename E>
    __global__ void fold_line_kernel(const E* evals, const S* domain_xs, E alpha, E* folded_evals, int half_n)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < half_n) {
        E f_x = evals[2 * idx];         // evaluation at x
        E f_x_neg = evals[2 * idx + 1]; // evaluation at -x
        ibutterfly(f_x, f_x_neg, domain_xs[idx].inverse());
        folded_evals[idx] = f_x + alpha * f_x_neg;
      }
    }

    // One thread per folded output; same layout as fold_line_kernel.
    // ACCUMULATES into folded_evals: out[i] = out[i] * alpha_sq + f_prime, so the output
    // buffer must hold the caller's previous folded values on entry.
    template <typename S, typename E>
    __global__ void
    fold_circle_into_line_kernel(const E* evals, const S* domain_ys, S alpha, S alpha_sq, E* folded_evals, int half_n)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < half_n) {
        E f0_px = evals[2 * idx];
        E f1_px = evals[2 * idx + 1];
        ibutterfly(f0_px, f1_px, domain_ys[idx].inverse());
        E f_prime = f0_px + alpha * f1_px;
        folded_evals[idx] = folded_evals[idx] * alpha_sq + f_prime;
      }
    }
  } // namespace

  // See fri/fri.cuh for the full contract. Stages host buffers onto the device as
  // dictated by cfg, launches the fold kernel, copies results back if requested, and
  // frees every temporary it allocated (the original leaked all of them).
  template <typename S, typename E>
  cudaError_t fold_line(E* evals, S* domain_xs, E alpha, E* folded_evals, int n, FriConfig cfg)
  {
    CHK_INIT_IF_RETURN();

    cudaStream_t stream = cfg.ctx.stream;
    int half_n = n / 2;

    // Stage line evaluations on device if the caller passed host memory.
    E* d_evals;
    if (!cfg.are_evals_on_device) {
      auto data_size = sizeof(E) * n;
      CHK_IF_RETURN(cudaMallocAsync(&d_evals, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_evals, evals, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_evals = evals;
    }

    // Stage domain elements; only half_n of them are consumed (index idx in [0, half_n)).
    // (Previously copied from `domain_ys`, a name that does not exist in this function.)
    S* d_domain_xs;
    if (!cfg.are_domain_elements_on_device) {
      auto data_size = sizeof(S) * half_n;
      CHK_IF_RETURN(cudaMallocAsync(&d_domain_xs, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_domain_xs, domain_xs, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_domain_xs = domain_xs;
    }

    // Output buffer for half_n folded evaluations.
    E* d_folded_evals;
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMallocAsync(&d_folded_evals, sizeof(E) * half_n, stream));
    } else {
      d_folded_evals = folded_evals; // was `d_out = out;` — undefined identifiers
    }

    int block_size = 256;
    int num_blocks = (half_n + block_size - 1) / block_size;
    // Launch on DEVICE pointers (the original passed the raw host-side arguments) and
    // with the missing comma before `stream` restored.
    fold_line_kernel<<<num_blocks, block_size, 0, stream>>>(d_evals, d_domain_xs, alpha, d_folded_evals, half_n);

    // Move folded_evals back to host if requested, then release temporaries.
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMemcpyAsync(folded_evals, d_folded_evals, sizeof(E) * half_n, cudaMemcpyDeviceToHost, stream));
      CHK_IF_RETURN(cudaFreeAsync(d_folded_evals, stream));
    }
    if (!cfg.are_evals_on_device) CHK_IF_RETURN(cudaFreeAsync(d_evals, stream));
    if (!cfg.are_domain_elements_on_device) CHK_IF_RETURN(cudaFreeAsync(d_domain_xs, stream));

    // Honor the config's async flag rather than special-casing the default stream.
    if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream));

    return CHK_LAST();
  }

  // See fri/fri.cuh for the full contract. Unlike fold_line, the kernel ACCUMULATES into
  // folded_evals, so when results live on the host the caller's current folded values are
  // copied to the device first (the original read an uninitialized device buffer).
  template <typename S, typename E>
  cudaError_t fold_circle_into_line(E* evals, S* domain_ys, S alpha, E* folded_evals, int n, FriConfig cfg)
  {
    CHK_INIT_IF_RETURN();

    cudaStream_t stream = cfg.ctx.stream;
    int half_n = n / 2;

    // Stage circle-domain evaluations on device if necessary.
    E* d_evals;
    if (!cfg.are_evals_on_device) {
      auto data_size = sizeof(E) * n;
      CHK_IF_RETURN(cudaMallocAsync(&d_evals, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_evals, evals, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_evals = evals;
    }

    // Stage the y-coordinates of the domain; only half_n of them are consumed.
    S* d_domain_ys;
    if (!cfg.are_domain_elements_on_device) {
      auto data_size = sizeof(S) * half_n;
      CHK_IF_RETURN(cudaMallocAsync(&d_domain_ys, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_domain_ys, domain_ys, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_domain_ys = domain_ys;
    }

    // Output buffer: must be seeded with the caller's existing folded values because the
    // kernel computes folded[i] = folded[i] * alpha^2 + f_prime.
    E* d_folded_evals;
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMallocAsync(&d_folded_evals, sizeof(E) * half_n, stream));
      CHK_IF_RETURN(
        cudaMemcpyAsync(d_folded_evals, folded_evals, sizeof(E) * half_n, cudaMemcpyHostToDevice, stream));
    } else {
      d_folded_evals = folded_evals; // was `d_out = out;` — undefined identifiers
    }

    S alpha_sq = alpha * alpha;
    int block_size = 256;
    int num_blocks = (half_n + block_size - 1) / block_size;
    fold_circle_into_line_kernel<<<num_blocks, block_size, 0, stream>>>(
      d_evals, d_domain_ys, alpha, alpha_sq, d_folded_evals, half_n);

    // Move folded_evals back to host if requested, then release temporaries.
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMemcpyAsync(folded_evals, d_folded_evals, sizeof(E) * half_n, cudaMemcpyDeviceToHost, stream));
      CHK_IF_RETURN(cudaFreeAsync(d_folded_evals, stream));
    }
    if (!cfg.are_evals_on_device) CHK_IF_RETURN(cudaFreeAsync(d_evals, stream));
    if (!cfg.are_domain_elements_on_device) CHK_IF_RETURN(cudaFreeAsync(d_domain_ys, stream));

    // Honor the config's async flag rather than special-casing the default stream.
    if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream));

    return CHK_LAST();
  }
} // namespace fri
1 change: 1 addition & 0 deletions wrappers/rust/icicle-fields/icicle-m31/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ cmake = "0.1.50"
[features]
default = []
devmode = ["icicle-core/devmode"]
fri = []
Loading

0 comments on commit 67da239

Please sign in to comment.