Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev v2 #95

Merged
merged 6 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Feat/poseidon (#75)
Adds poseidon function to compute hashes over multiple preimages in parallel
  • Loading branch information
ChickenLover authored and jeremyfelder committed Jun 1, 2023
commit 43f8c01afe34a8e53cb0c7ffc0de27a3dc80c5db
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ ark-bls12-381 = "0.3.0"
ark-bls12-377 = "0.3.0"
ark-bn254 = "0.3.0"

serde = { version = "1.0", features = ["derive"] }
serde_derive = "1.0"
serde_cbor = "0.11.2"

rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"
Expand Down
51 changes: 51 additions & 0 deletions icicle/appUtils/poseidon/constants.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#pragma once

#include <map>
#include <stdexcept>
#include <cassert>

#include "constants/constants_2.h"
#include "constants/constants_4.h"
#include "constants/constants_8.h"
#include "constants/constants_11.h"

// Number of partial rounds for each supported Poseidon arity.
// NOTE(review): `uint` is not a standard C++ type — presumably pulled in via
// a POSIX header or a project typedef; confirm it is always in scope here.
const std::map<uint, uint> ARITY_TO_ROUND_NUMBERS = {
{2, 55},
{4, 56},
{8, 57},
{11, 57},
};

// Half the number of full rounds (same for every arity today).
// TO-DO: change to mapping
const uint FULL_ROUNDS_DEFAULT = 4;

// Looks up the Poseidon round numbers for a given arity, writing the number
// of partial rounds and half the number of full rounds through the output
// pointers.
// Throws std::invalid_argument for an unsupported arity, matching
// load_constants below. (The previous assert() compiled away under NDEBUG,
// leaving a dereference of map::end() — undefined behavior — in release
// builds.)
static void get_round_numbers(const uint arity, uint * partial_rounds, uint * half_full_rounds) {
    auto partial = ARITY_TO_ROUND_NUMBERS.find(arity);
    if (partial == ARITY_TO_ROUND_NUMBERS.end()) {
        throw std::invalid_argument( "unsupported arity" );
    }

    *partial_rounds = partial->second;
    *half_full_rounds = FULL_ROUNDS_DEFAULT;
}

// TO-DO: for now, the constants are only generated in bls12_381
template <typename S>
S * load_constants(const uint arity) {
unsigned char * constants;
switch (arity) {
case 2:
constants = constants_2;
break;
case 4:
constants = constants_4;
break;
case 8:
constants = constants_8;
break;
case 11:
constants = constants_11;
break;
default:
throw std::invalid_argument( "unsupported arity" );
}
return reinterpret_cast< S * >(constants);
}
4,675 changes: 4,675 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_11.h

Large diffs are not rendered by default.

995 changes: 995 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_2.h

Large diffs are not rendered by default.

1,737 changes: 1,737 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_4.h

Large diffs are not rendered by default.

3,363 changes: 3,363 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_8.h

Large diffs are not rendered by default.

271 changes: 271 additions & 0 deletions icicle/appUtils/poseidon/poseidon.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
#include "poseidon.cuh"

// One thread per state element: thread idx works on element (idx % t) of
// state (idx / t). Slot 0 of each state receives the domain tag; slots
// 1..t-1 receive the preimage elements; every slot then gets its pre-round
// constant added.
// Expects a 1D launch with at least number_of_states * config.t threads.
template <typename S>
__global__ void prepare_poseidon_states(S * inp, S * states, size_t number_of_states, S domain_tag, const PoseidonConfiguration<S> config) {
    // size_t indexing: the previous `int` index overflowed past 2^31 elements
    // and triggered a signed/unsigned comparison against number_of_states
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t state_number = idx / config.t;
    if (state_number >= number_of_states) {
        return;
    }
    int element_number = idx % config.t;

    S prepared_element;

    // Domain separation: element 0 carries the domain tag, the rest are the
    // preimage (input is packed t-1 elements per state)
    if (element_number == 0) {
        prepared_element = domain_tag;
    } else {
        prepared_element = inp[state_number * (config.t - 1) + element_number - 1];
    }

    // Add pre-round constant
    prepared_element = prepared_element + config.round_constants[element_number];

    // Store element in state
    states[idx] = prepared_element;
}

// Poseidon s-box: raises the element to the fifth power, computed as
// (x^2)^2 * x — two squarings and one multiplication.
template <typename S>
__device__ __forceinline__ S sbox_alpha_five(S element) {
    S squared = S::sqr(element);
    S fourth = S::sqr(squared);
    return fourth * element;
}

// Multiplies the state vectors staged in this thread block by `matrix`
// (accessed as matrix[i * size + element_number], i.e. one matrix column per
// output element). Each thread deposits its own element into shared memory,
// then computes one output element as the dot product of its state's `size`
// shared elements with a matrix column. `shared_states` must hold
// blockDim.x elements of S.
// NOTE(review): both __syncthreads() calls must be reached by every thread
// of the block; callers must not early-return a subset of the block's
// threads before calling into here — TODO confirm all launch configurations
// guarantee this.
template <typename S>
__device__ S vecs_mul_matrix(S element, S * matrix, int element_number, int vec_number, int size, S * shared_states) {
// Publish this thread's element so the whole block can read it
shared_states[threadIdx.x] = element;
__syncthreads();

element = S::zero();
for (int i = 0; i < size; i++) {
element = element + (shared_states[vec_number * size + i] * matrix[i * size + element_number]);
}
// Second barrier: don't let any thread overwrite shared_states on the next
// call while others are still reading it
__syncthreads();
return element;
}

// One full Poseidon round for a single state element: apply the x^5 s-box,
// optionally add the round constant, then multiply the whole state by
// either the MDS matrix or the non-sparse transition matrix.
template <typename S>
__device__ S full_round(S element,
                        size_t rc_offset,
                        int local_state_number,
                        int element_number,
                        bool multiply_by_mds,
                        bool add_round_constant,
                        S * shared_states,
                        const PoseidonConfiguration<S> config) {
    S value = sbox_alpha_five(element);
    if (add_round_constant) {
        value = value + config.round_constants[rc_offset + element_number];
    }

    // Select which matrix multiplies the state this round
    S * matrix;
    if (multiply_by_mds) {
        matrix = config.mds_matrix;
    } else {
        matrix = config.non_sparse_matrix;
    }
    return vecs_mul_matrix(value, matrix, element_number, local_state_number, config.t, shared_states);
}

// Execute full rounds
template <typename S>
__global__ void full_rounds(S * states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConfiguration<S> config) {
extern __shared__ S shared_states[];

int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
int state_number = idx / config.t;
if (state_number >= number_of_states) {
return;
}
int local_state_number = threadIdx.x / config.t;
int element_number = idx % config.t;

for (int i = 0; i < config.full_rounds_half - 1; i++) {
states[idx] = full_round(states[idx],
rc_offset,
local_state_number,
element_number,
true,
true,
shared_states,
config);
rc_offset += config.t;
}

states[idx] = full_round(states[idx],
rc_offset,
local_state_number,
element_number,
!first_half,
first_half,
shared_states,
config);
}

// One partial Poseidon round, executed by a single thread over the whole
// state: s-box and round constant are applied to element 0 only, then the
// state is multiplied by this round's sparse matrix. Each sparse matrix is
// stored as (2t - 1) entries: the first row (t entries) followed by the
// remainder of the first column (t - 1 entries).
// Fixed: the function was declared to return S but had no return statement;
// flowing off the end of a non-void function is undefined behavior. The
// only caller ignores the result, so the return type is now void.
template <typename S>
__device__ void partial_round(S * state,
                              size_t rc_offset,
                              int round_number,
                              const PoseidonConfiguration<S> config) {
    S element = state[0];
    element = sbox_alpha_five(element);
    element = element + config.round_constants[rc_offset];

    S * sparse_matrix = &config.sparse_matrices[(config.t * 2 - 1) * round_number];

    // state[0] = first matrix row dotted with the state
    state[0] = element * sparse_matrix[0];
    for (int i = 1; i < config.t; i++) {
        state[0] = state[0] + (state[i] * sparse_matrix[i]);
    }

    // Remaining elements pick up only the first-column contribution
    for (int i = 1; i < config.t; i++) {
        state[i] = state[i] + (element * sparse_matrix[config.t + i - 1]);
    }
}

// Execute all partial rounds; one thread processes one entire state.
template <typename S>
__global__ void partial_rounds(S * states, size_t number_of_states, size_t rc_offset, const PoseidonConfiguration<S> config) {
    int thread_id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (thread_id >= number_of_states) {
        return;
    }

    // This thread owns a contiguous state of t elements
    S * state = states + (size_t)thread_id * config.t;

    for (int round = 0; round < config.partial_rounds; round++, rc_offset++) {
        partial_round(state, rc_offset, round, config);
    }
}

// Copies each hash result out of the states: the digest is element 1 of the
// t-element state.
template <typename S>
__global__ void get_hash_results(S * states, size_t number_of_states, S * out, int t) {
    int hash_index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (hash_index >= number_of_states) {
        return;
    }

    out[hash_index] = states[hash_index * t + 1];
}

// Hashes {blocks} preimages of (t - 1) field elements each, writing one
// digest per preimage to `out`. Host orchestration: copies input to the
// device, runs prepare -> first half of full rounds -> partial rounds ->
// second half of full rounds -> result extraction, then copies digests back.
// NOTE(review): no CUDA API call or kernel launch here is error-checked
// (cudaMalloc/cudaMemcpy return values ignored, no cudaGetLastError after
// launches) — failures will surface silently as garbage output. Consider a
// CUDA_CHECK-style macro.
template <typename S>
__host__ void Poseidon<S>::hash_blocks(const S * inp, size_t blocks, S * out, HashType hash_type) {
// Used in matrix multiplication

S * states, * inp_device;

// allocate memory for {blocks} states of {t} scalars each
cudaMalloc(&states, blocks * this->t * sizeof(S));

// Move input to cuda: input is packed (t - 1) elements per preimage
cudaMalloc(&inp_device, blocks * (this->t - 1) * sizeof(S));
cudaMemcpy(inp_device, inp, blocks * (this->t - 1) * sizeof(S), cudaMemcpyHostToDevice);

// Running offset into the round-constants table; advanced after each stage
size_t rc_offset = 0;

// The logic behind this is that 1 thread only works on 1 element
// We have {t} elements in each state, and {blocks} states total.
// Block size is rounded down to a multiple of t so a block always holds
// whole states (required by the shared-memory layout in full_rounds).
int number_of_threads = (256 / this->t) * this->t;
int hashes_per_block = number_of_threads / this->t;
int total_number_of_threads = blocks * this->t;
// Ceiling division: one extra block if the thread count doesn't divide evenly
int number_of_blocks = total_number_of_threads / number_of_threads +
static_cast<bool>(total_number_of_threads % number_of_threads);

// The partial rounds operates on the whole state, so we define
// the parallelism params for processing a single hash preimage per thread
int singlehash_block_size = 128;
int number_of_singlehash_blocks = blocks / singlehash_block_size + static_cast<bool>(blocks % singlehash_block_size);

// Pick the domain_tag accordingly
// NOTE(review): no default case — domain_tag stays uninitialized if
// hash_type is ever anything other than these two values; add a default or
// an exhaustive-switch check.
S domain_tag;
switch (hash_type) {
case HashType::ConstInputLen:
domain_tag = this->const_input_no_pad_domain_tag;
break;

case HashType::MerkleTree:
domain_tag = this->tree_domain_tag;
}

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
auto start_time = std::chrono::high_resolution_clock::now();
#endif

// Domain separation and adding pre-round constants
prepare_poseidon_states <<< number_of_blocks, number_of_threads >>> (inp_device, states, blocks, domain_tag, this->config);
// prepare consumed the first t round constants
rc_offset += this->t;
cudaFree(inp_device);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Domain separation: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);

auto end_time = std::chrono::high_resolution_clock::now();
auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// execute half full rounds; shared memory holds one block's worth of states
full_rounds <<< number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block * this->t >>> (states, blocks, rc_offset, true, this->config);
rc_offset += this->t * this->config.full_rounds_half;

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Full rounds 1. RCOFFSET: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);

end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// execute partial rounds: one thread per whole state (one constant per round)
partial_rounds <<< number_of_singlehash_blocks, singlehash_block_size >>> (states, blocks, rc_offset, this->config);
rc_offset += this->config.partial_rounds;

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Partial rounds. RCOFFSET: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);

end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// execute half full rounds (second half: first_half = false)
full_rounds <<< number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block * this->t >>> (states, blocks, rc_offset, false, this->config);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Full rounds 2. RCOFFSET: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);
end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// get output: one digest (element 1 of each state) per preimage
S * out_device;
cudaMalloc(&out_device, blocks * sizeof(S));
get_hash_results <<< number_of_singlehash_blocks, singlehash_block_size >>> (states, blocks, out_device, this->config.t);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Get hash results" << std::endl;
end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
#endif
// blocking copy: also synchronizes on the preceding kernels in the stream
cudaMemcpy(out, out_device, blocks * sizeof(S), cudaMemcpyDeviceToHost);
cudaFree(out_device);
cudaFree(states);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceReset();
#endif
}
Loading