Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremyfelder committed Sep 23, 2024
1 parent 98dd414 commit 67da239
Show file tree
Hide file tree
Showing 7 changed files with 397 additions and 0 deletions.
59 changes: 59 additions & 0 deletions icicle/include/fri/fri.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#pragma once
#ifndef FRI_H
#define FRI_H

#include <cuda_runtime.h>

#include "gpu-utils/device_context.cuh"

namespace fri {

/**
 * @struct FriConfig
 * Configuration for FRI folding operations (fold_line / fold_circle_into_line).
 */
struct FriConfig {
  device_context::DeviceContext ctx; // Device context carrying the CUDA stream to run on.
  bool are_evals_on_device;          // True if the input evaluations already live in device memory.
  bool are_domain_elements_on_device;// True if the domain elements already live in device memory.
  bool are_results_on_device;        // True if folded results should be left in device memory.
  bool is_async;                     // True to return without synchronizing the stream.
}; // NOTE: the trailing semicolon is required; omitting it is a compile error.

/**
 * @brief Folds a layer's evaluation into a degree d/2 evaluation using the provided folding factor alpha.
 *
 * @param evals Pointer to the array of evaluation in the current FRI layer.
 * @param domain_xs Pointer to a subset of line domain values.
 * @param alpha The folding factor used in the FRI protocol.
 * @param folded_evals Pointer to the array where the folded evaluations will be stored.
 * @param n The number of evaluations in the original layer (before folding).
 * @param cfg Configuration controlling device residency of the buffers and async behavior.
 *
 * @tparam S The scalar field type used for domain_xs.
 * @tparam E The evaluation type, typically the same as the field element type.
 *
 * @return `cudaSuccess` on success, or a CUDA error code otherwise.
 *
 * @note The size of the output array 'folded_evals' should be half of 'n', as folding reduces the number of evaluations by half.
 */
template <typename S, typename E>
cudaError_t fold_line(E* evals, S* domain_xs, E alpha, E* folded_evals, int n, FriConfig cfg);

/**
 * @brief Folds a layer of FRI evaluations from a circle into a line.
 *
 * This function performs the folding operation in the FRI (Fast Reed-Solomon IOP of Proximity) protocol,
 * specifically for evaluations on a circle domain. It takes a layer of evaluations on a circle and folds
 * them into a line using the provided folding factor alpha.
 *
 * @param evals Pointer to the array of evaluations in the current FRI layer, representing points on a circle.
 * @param domain_ys Pointer to the array of y-coordinates of the circle points in the domain of the circle that evals represents.
 * @param alpha The folding factor used in the FRI protocol.
 * @param folded_evals Pointer to the array where the folded evaluations (now on a line) will be stored.
 * @param n The number of evaluations in the original layer (before folding).
 * @param cfg Configuration controlling device residency of the buffers and async behavior.
 *
 * @tparam S The scalar field type used for alpha and domain_ys.
 * @tparam E The evaluation type, typically the same as the field element type.
 *
 * @return `cudaSuccess` on success, or a CUDA error code otherwise.
 *
 * @note The size of the output array 'folded_evals' should be half of 'n', as folding reduces the number of evaluations by half.
 * @note This function is specifically designed for folding evaluations from a circular domain to a linear domain.
 */
template <typename S, typename E>
cudaError_t fold_circle_into_line(E* evals, S* domain_ys, S alpha, E* folded_evals, int n, FriConfig cfg);

} // namespace fri

#endif // FRI_H  (guard was previously left unterminated — compile error at end of TU)
6 changes: 6 additions & 0 deletions icicle/src/fields/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ endif ()

SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin;m31)
SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252;m31)
SET(SUPPORTED_FIELDS_WITH_FRI m31)

set(TARGET icicle_field)

Expand Down Expand Up @@ -42,6 +43,11 @@ if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES}) # requires NTT
endif()

if (FIELD IN_LIST SUPPORTED_FIELDS_WITH_FRI)
list(APPEND FIELD_SOURCE ${SRC}/fri/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/fri/fri.cu)
endif()

add_library(${TARGET} STATIC ${FIELD_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
Expand Down
38 changes: 38 additions & 0 deletions icicle/src/fri/extern.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// NOTE(review): extension_t / scalar_t and the CONCAT_EXPAND macro are not declared by the
// includes that were here originally; field_config.cuh and utils.h provide them elsewhere in
// this codebase — confirm the exact paths against the build.
#include "fields/field_config.cuh"
using namespace field_config;

#include "fri/fri.cuh"
#include "utils/utils.h"
#include "gpu-utils/device_context.cuh"

namespace fri {
  /**
   * Extern "C" version of [fold_line](@ref fold_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param line_eval Pointer to the array of evaluations on the line
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg Configuration carrying the device context; if the stream is not 0, then everything is run async
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  // The symbol is prefixed with FIELD (not CURVE): this file is built per-field by
  // icicle/src/fields/CMakeLists.txt, and the doc above states the field comes from -DFIELD.
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_line)(
    extension_t* line_eval, scalar_t* domain_elements, extension_t alpha, extension_t* folded_evals, int n, FriConfig cfg)
  {
    return fold_line(line_eval, domain_elements, alpha, folded_evals, n, cfg);
  }

  /**
   * Extern "C" version of [fold_circle_into_line](@ref fold_circle_into_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param circle_evals Pointer to the array of evaluations on the circle
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_line_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg Configuration carrying the device context; if the stream is not 0, then everything is run async
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_circle_into_line)(
    extension_t* circle_evals, scalar_t* domain_elements, extension_t alpha, extension_t* folded_line_evals, int n, FriConfig cfg)
  {
    return fold_circle_into_line(circle_evals, domain_elements, alpha, folded_line_evals, n, cfg);
  }
} // namespace fri
142 changes: 142 additions & 0 deletions icicle/src/fri/fri.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#include <cuda_runtime.h>

// Was `#include "fri.h"` — no such file exists; the header added in this change is
// icicle/include/fri/fri.cuh, and the include root is icicle/include (see CMakeLists).
#include "fri/fri.cuh"

#include "fields/field.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/device_context.cuh"

// NOTE(review): these templates are defined in a .cu translation unit; the extern "C"
// wrappers in extern.cu need either explicit instantiations here or to #include this
// file — verify against how the rest of the library handles template definitions.

namespace fri {

  namespace {
    // Inverse butterfly: (v0, v1) <- (v0 + v1, (v0 - v1) * itwid).
    template <typename S, typename E>
    __device__ void ibutterfly(E& v0, E& v1, const S& itwid)
    {
      E tmp = v0;
      v0 = tmp + v1;
      v1 = (tmp - v1) * itwid;
    }

    // One thread per folded output. The grid must cover [0, half_n).
    // Reads evals[2*i] (f(x)) and evals[2*i+1] (f(-x)); writes folded_evals[i].
    // (Previously the kernel required `idx % 2 == 0 && idx < n` while the host launched
    // only n/2 threads, so only n/4 outputs were ever produced.)
    template <typename S, typename E>
    __global__ void fold_line_kernel(const E* evals, const S* domain_xs, E alpha, E* folded_evals, int half_n)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < half_n) {
        E f_x = evals[2 * idx];         // evaluation at x
        E f_x_neg = evals[2 * idx + 1]; // evaluation at -x
        ibutterfly(f_x, f_x_neg, domain_xs[idx].inverse());
        folded_evals[idx] = f_x + alpha * f_x_neg;
      }
    }

    // One thread per folded output; same layout as fold_line_kernel.
    // ACCUMULATES into folded_evals: out[i] = out[i] * alpha_sq + f_prime, so the output
    // buffer must hold the caller's previous folded values on entry.
    template <typename S, typename E>
    __global__ void
    fold_circle_into_line_kernel(const E* evals, const S* domain_ys, S alpha, S alpha_sq, E* folded_evals, int half_n)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < half_n) {
        E f0_px = evals[2 * idx];
        E f1_px = evals[2 * idx + 1];
        ibutterfly(f0_px, f1_px, domain_ys[idx].inverse());
        E f_prime = f0_px + alpha * f1_px;
        folded_evals[idx] = folded_evals[idx] * alpha_sq + f_prime;
      }
    }
  } // namespace

  // See fri/fri.cuh for the full contract. Stages host buffers onto the device as
  // dictated by cfg, launches the fold kernel, copies results back if requested, and
  // frees every temporary it allocated (the original leaked all of them).
  template <typename S, typename E>
  cudaError_t fold_line(E* evals, S* domain_xs, E alpha, E* folded_evals, int n, FriConfig cfg)
  {
    CHK_INIT_IF_RETURN();

    cudaStream_t stream = cfg.ctx.stream;
    int half_n = n / 2;

    // Stage line evaluations on device if the caller passed host memory.
    E* d_evals;
    if (!cfg.are_evals_on_device) {
      auto data_size = sizeof(E) * n;
      CHK_IF_RETURN(cudaMallocAsync(&d_evals, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_evals, evals, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_evals = evals;
    }

    // Stage domain elements; only half_n of them are consumed (index idx in [0, half_n)).
    // (Previously copied from `domain_ys`, a name that does not exist in this function.)
    S* d_domain_xs;
    if (!cfg.are_domain_elements_on_device) {
      auto data_size = sizeof(S) * half_n;
      CHK_IF_RETURN(cudaMallocAsync(&d_domain_xs, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_domain_xs, domain_xs, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_domain_xs = domain_xs;
    }

    // Output buffer for half_n folded evaluations.
    E* d_folded_evals;
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMallocAsync(&d_folded_evals, sizeof(E) * half_n, stream));
    } else {
      d_folded_evals = folded_evals; // was `d_out = out;` — undefined identifiers
    }

    int block_size = 256;
    int num_blocks = (half_n + block_size - 1) / block_size;
    // Launch on DEVICE pointers (the original passed the raw host-side arguments) and
    // with the missing comma before `stream` restored.
    fold_line_kernel<<<num_blocks, block_size, 0, stream>>>(d_evals, d_domain_xs, alpha, d_folded_evals, half_n);

    // Move folded_evals back to host if requested, then release temporaries.
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMemcpyAsync(folded_evals, d_folded_evals, sizeof(E) * half_n, cudaMemcpyDeviceToHost, stream));
      CHK_IF_RETURN(cudaFreeAsync(d_folded_evals, stream));
    }
    if (!cfg.are_evals_on_device) CHK_IF_RETURN(cudaFreeAsync(d_evals, stream));
    if (!cfg.are_domain_elements_on_device) CHK_IF_RETURN(cudaFreeAsync(d_domain_xs, stream));

    // Honor the config's async flag rather than special-casing the default stream.
    if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream));

    return CHK_LAST();
  }

  // See fri/fri.cuh for the full contract. Unlike fold_line, the kernel ACCUMULATES into
  // folded_evals, so when results live on the host the caller's current folded values are
  // copied to the device first (the original read an uninitialized device buffer).
  template <typename S, typename E>
  cudaError_t fold_circle_into_line(E* evals, S* domain_ys, S alpha, E* folded_evals, int n, FriConfig cfg)
  {
    CHK_INIT_IF_RETURN();

    cudaStream_t stream = cfg.ctx.stream;
    int half_n = n / 2;

    // Stage circle-domain evaluations on device if necessary.
    E* d_evals;
    if (!cfg.are_evals_on_device) {
      auto data_size = sizeof(E) * n;
      CHK_IF_RETURN(cudaMallocAsync(&d_evals, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_evals, evals, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_evals = evals;
    }

    // Stage the y-coordinates of the domain; only half_n of them are consumed.
    S* d_domain_ys;
    if (!cfg.are_domain_elements_on_device) {
      auto data_size = sizeof(S) * half_n;
      CHK_IF_RETURN(cudaMallocAsync(&d_domain_ys, data_size, stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_domain_ys, domain_ys, data_size, cudaMemcpyHostToDevice, stream));
    } else {
      d_domain_ys = domain_ys;
    }

    // Output buffer: must be seeded with the caller's existing folded values because the
    // kernel computes folded[i] = folded[i] * alpha^2 + f_prime.
    E* d_folded_evals;
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMallocAsync(&d_folded_evals, sizeof(E) * half_n, stream));
      CHK_IF_RETURN(
        cudaMemcpyAsync(d_folded_evals, folded_evals, sizeof(E) * half_n, cudaMemcpyHostToDevice, stream));
    } else {
      d_folded_evals = folded_evals; // was `d_out = out;` — undefined identifiers
    }

    S alpha_sq = alpha * alpha;
    int block_size = 256;
    int num_blocks = (half_n + block_size - 1) / block_size;
    fold_circle_into_line_kernel<<<num_blocks, block_size, 0, stream>>>(
      d_evals, d_domain_ys, alpha, alpha_sq, d_folded_evals, half_n);

    // Move folded_evals back to host if requested, then release temporaries.
    if (!cfg.are_results_on_device) {
      CHK_IF_RETURN(cudaMemcpyAsync(folded_evals, d_folded_evals, sizeof(E) * half_n, cudaMemcpyDeviceToHost, stream));
      CHK_IF_RETURN(cudaFreeAsync(d_folded_evals, stream));
    }
    if (!cfg.are_evals_on_device) CHK_IF_RETURN(cudaFreeAsync(d_evals, stream));
    if (!cfg.are_domain_elements_on_device) CHK_IF_RETURN(cudaFreeAsync(d_domain_ys, stream));

    // Honor the config's async flag rather than special-casing the default stream.
    if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream));

    return CHK_LAST();
  }
} // namespace fri
1 change: 1 addition & 0 deletions wrappers/rust/icicle-fields/icicle-m31/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ cmake = "0.1.50"
[features]
default = []
devmode = ["icicle-core/devmode"]
fri = []
Loading

0 comments on commit 67da239

Please sign in to comment.