Skip to content

Commit

Permalink
Mixed-radix NTT algorithm
Browse files Browse the repository at this point in the history
Co-authored-by: hadaringonyama <hadar@ingonyama.com>
  • Loading branch information
yshekel and HadarIngonyama committed Feb 8, 2024
1 parent b20ef93 commit 3582df2
Show file tree
Hide file tree
Showing 20 changed files with 1,733 additions and 80 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build
**/icicle/appUtils/large_ntt/work
icicle/appUtils/large_ntt/work/test_ntt
39 changes: 21 additions & 18 deletions examples/c++/ntt/example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,39 @@
#define CURVE_ID 1
// include NTT template
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
using namespace curve_config;

// Operate on scalars
typedef scalar_t S;
typedef scalar_t E;

void print_elements(const unsigned n, E * elements ) {
void print_elements(const unsigned n, E* elements)
{
for (unsigned i = 0; i < n; i++) {
std::cout << i << ": " << elements[i] << std::endl;
std::cout << i << ": " << elements[i] << std::endl;
}
}

void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
// Lowest Harmonics
for (unsigned i = 0; i < ntt_size; i=i+1) {
for (unsigned i = 0; i < ntt_size; i = i + 1) {
elements[i] = E::one();
}
// print_elements(ntt_size, elements );
// Highest Harmonics
for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
elements[i] = E::one();
elements[i+1] = E::neg(scalar_t::one());
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
elements[i] = E::one();
elements[i + 1] = E::neg(scalar_t::one());
}
// print_elements(ntt_size, &elements[1*ntt_size] );
}

int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
int nof_errors = 0;
E amplitude = E::from((uint32_t) ntt_size);
E amplitude = E::from((uint32_t)ntt_size);
// std::cout << "Amplitude: " << amplitude << std::endl;
// Lowest Harmonics
if (elements[0] != amplitude) {
Expand All @@ -44,8 +47,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
} else {
std::cout << "Validated lowest harmonics" << std::endl;
}
// Highest Harmonics
if (elements[1*ntt_size+ntt_size/2] != amplitude) {
// Highest Harmonics
if (elements[1 * ntt_size + ntt_size / 2] != amplitude) {
++nof_errors;
std::cout << "Error in highest harmonics! " << std::endl;
// print_elements(ntt_size, &elements[1*ntt_size] );
Expand All @@ -66,32 +69,32 @@ int main(int argc, char* argv[])
const unsigned nof_ntts = 2;
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
const unsigned batch_size = nof_ntts * ntt_size;

std::cout << "Generating input data for lowest and highest harmonics" << std::endl;
E* input;
input = (E*) malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, input );
input = (E*)malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, input);
E* output;
output = (E*) malloc(sizeof(E) * batch_size);
output = (E*)malloc(sizeof(E) * batch_size);

std::cout << "Running NTT with on-host data" << std::endl;
cudaStream_t stream;
cudaStreamCreate(&stream);
// Create a device context
auto ctx = device_context::get_default_device_context();
// the next line is valid only for CURVE_ID 1 (will add support for other curves soon)
S rou = S{ {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1} };
S rou = S{{0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1}};
ntt::InitDomain(rou, ctx);
// Create an NTTConfig instance
ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
ntt::NTTConfig<S> config = ntt::DefaultNTTConfig<S>();
config.batch_size = nof_ntts;
config.ctx.stream = stream;
auto begin0 = std::chrono::high_resolution_clock::now();
cudaError_t err = ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
auto end0 = std::chrono::high_resolution_clock::now();
auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
validate_output(ntt_size, nof_ntts, output );
validate_output(ntt_size, nof_ntts, output);
cudaStreamDestroy(stream);
free(input);
free(output);
Expand Down
25 changes: 25 additions & 0 deletions examples/c++/polynomial_multiplication/.devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Make sure NVIDIA Container Toolkit is installed on your host

# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04

# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
curl \
build-essential \
git \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*

# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle

# Set the working directory in the container
WORKDIR /icicle-example

# Specify the default command for the container
CMD ["/bin/bash"]



Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"name": "Icicle Examples: polynomial multiplication",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python",
"ms-vscode.cpptools"
]
}
}
}
26 changes: 26 additions & 0 deletions examples/c++/polynomial_multiplication/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)

find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

11 changes: 11 additions & 0 deletions examples/c++/polynomial_multiplication/compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# Exit immediately on error
set -e

rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build


114 changes: 114 additions & 0 deletions examples/c++/polynomial_multiplication/example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#define CURVE_ID BLS12_381

#include <chrono>
#include <iostream>
#include <vector>

#include "curves/curve_config.cuh"
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
#include "utils/vec_ops.cu"
#include "utils/error_handler.cuh"
#include <memory>

typedef curve_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_data;

void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}

void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() * test_scalar::omega(4) : test_scalar::zero();
}
}

// calcaulting polynomial multiplication A*B via NTT,pointwise-multiplication and INTT
// (1) allocate A,B on CPU. Randomize first half, zero second half
// (2) allocate NttAGpu, NttBGpu on GPU
// (3) calc NTT for A and for B from cpu to GPU
// (4) multiply MulGpu = NttAGpu * NttBGpu (pointwise)
// (5) INTT MulGpu inplace

int main(int argc, char** argv)
{
cudaEvent_t start, stop;
float measured_time;

int NTT_LOG_SIZE = 23;
int NTT_SIZE = 1 << NTT_LOG_SIZE;

CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context

// init domain
auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
ntt_config.ordering = ntt::Ordering::kNN; // TODO: use NR for forward and RN for backward
ntt_config.is_force_radix2 = (argc > 1) ? atoi(argv[1]) : false;

const char* ntt_alg_str = ntt_config.is_force_radix2 ? "Radix-2" : "Mixed-Radix";
std::cout << "Polynomial multiplication with " << ntt_alg_str << " NTT: ";

CHK_IF_RETURN(cudaEventCreate(&start));
CHK_IF_RETURN(cudaEventCreate(&stop));

const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::InitDomain(basic_root, ntt_config.ctx);

// (1) cpu allocation
auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
auto CpuB = std::make_unique<test_data[]>(NTT_SIZE);
random_samples(CpuA.get(), NTT_SIZE >> 1); // second half zeros
random_samples(CpuB.get(), NTT_SIZE >> 1); // second half zeros

test_data *GpuA, *GpuB, *MulGpu;

auto benchmark = [&](bool print, int iterations = 1) {
// start recording
CHK_IF_RETURN(cudaEventRecord(start, ntt_config.ctx.stream));

for (int iter = 0; iter < iterations; ++iter) {
// (2) gpu input allocation
CHK_IF_RETURN(cudaMallocAsync(&GpuA, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&GpuB, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));

// (3) NTT for A,B from cpu to gpu
ntt_config.are_inputs_on_device = false;
ntt_config.are_outputs_on_device = true;
CHK_IF_RETURN(ntt::NTT(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
CHK_IF_RETURN(ntt::NTT(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));

// (4) multiply A,B
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
CHK_IF_RETURN(
vec_ops::Mul(GpuA, GpuB, NTT_SIZE, true /*=is_on_device*/, false /*=is_montgomery*/, ntt_config.ctx, MulGpu));

// (5) INTT (in place)
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
CHK_IF_RETURN(ntt::NTT(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));

CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(MulGpu, ntt_config.ctx.stream));
}

CHK_IF_RETURN(cudaEventRecord(stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&measured_time, start, stop));

if (print) { std::cout << measured_time / iterations << " MS" << std::endl; }

return CHK_LAST();
};

benchmark(false); // warmup
benchmark(true, 20);

CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));

return 0;
}
3 changes: 3 additions & 0 deletions examples/c++/polynomial_multiplication/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
./build/example 1 # radix2
./build/example 0 # mixed-radix
2 changes: 1 addition & 1 deletion examples/rust/msm/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ struct Args {
lower_bound_log_size: u8,

/// Upper bound of MSM sizes to run for
#[arg(short, long, default_value_t = 23)]
#[arg(short, long, default_value_t = 22)]
upper_bound_log_size: u8,
}

Expand Down
1 change: 1 addition & 0 deletions icicle/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ if (NOT BUILD_TESTS)
primitives/projective.cu
appUtils/msm/msm.cu
appUtils/ntt/ntt.cu
appUtils/ntt/kernel_ntt.cu
${ICICLE_SOURCES}
)
set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
Expand Down
6 changes: 6 additions & 0 deletions icicle/appUtils/ntt/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
build_verification:
mkdir -p work
nvcc -o work/test_verification -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17

test_verification: build_verification
work/test_verification
Loading

0 comments on commit 3582df2

Please sign in to comment.