Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev v2 #95

Merged
merged 6 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Feat/poseidon (#75)
Adds poseidon function to compute hashes over multiple preimages in parallel
  • Loading branch information
ChickenLover authored and jeremyfelder committed Jun 1, 2023
commit 43f8c01afe34a8e53cb0c7ffc0de27a3dc80c5db
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ ark-bls12-381 = "0.3.0"
ark-bls12-377 = "0.3.0"
ark-bn254 = "0.3.0"

serde = { version = "1.0", features = ["derive"] }
serde_derive = "1.0"
serde_cbor = "0.11.2"

rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"
Expand Down
51 changes: 51 additions & 0 deletions icicle/appUtils/poseidon/constants.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#pragma once

#include <map>
#include <stdexcept>
#include <cassert>

#include "constants/constants_2.h"
#include "constants/constants_4.h"
#include "constants/constants_8.h"
#include "constants/constants_11.h"

// Number of partial rounds for each supported Poseidon arity.
// NOTE(review): `uint` is not a standard C++ type — presumably pulled in via
// a POSIX header or a project typedef; confirm it is always in scope here.
const std::map<uint, uint> ARITY_TO_ROUND_NUMBERS = {
{2, 55},
{4, 56},
{8, 57},
{11, 57},
};

// Half the number of full rounds (same for every arity today).
// TO-DO: change to mapping
const uint FULL_ROUNDS_DEFAULT = 4;

// Looks up the Poseidon round numbers for a given arity, writing the number
// of partial rounds and half the number of full rounds through the output
// pointers.
// Throws std::invalid_argument for an unsupported arity, matching
// load_constants below. (The previous assert() compiled away under NDEBUG,
// leaving a dereference of map::end() — undefined behavior — in release
// builds.)
static void get_round_numbers(const uint arity, uint * partial_rounds, uint * half_full_rounds) {
    auto partial = ARITY_TO_ROUND_NUMBERS.find(arity);
    if (partial == ARITY_TO_ROUND_NUMBERS.end()) {
        throw std::invalid_argument( "unsupported arity" );
    }

    *partial_rounds = partial->second;
    *half_full_rounds = FULL_ROUNDS_DEFAULT;
}

// TO-DO: for now, the constants are only generated in bls12_381
template <typename S>
S * load_constants(const uint arity) {
unsigned char * constants;
switch (arity) {
case 2:
constants = constants_2;
break;
case 4:
constants = constants_4;
break;
case 8:
constants = constants_8;
break;
case 11:
constants = constants_11;
break;
default:
throw std::invalid_argument( "unsupported arity" );
}
return reinterpret_cast< S * >(constants);
}
4,675 changes: 4,675 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_11.h

Large diffs are not rendered by default.

995 changes: 995 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_2.h

Large diffs are not rendered by default.

1,737 changes: 1,737 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_4.h

Large diffs are not rendered by default.

3,363 changes: 3,363 additions & 0 deletions icicle/appUtils/poseidon/constants/constants_8.h

Large diffs are not rendered by default.

271 changes: 271 additions & 0 deletions icicle/appUtils/poseidon/poseidon.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
#include "poseidon.cuh"

// One thread per state element: thread idx works on element (idx % t) of
// state (idx / t). Slot 0 of each state receives the domain tag; slots
// 1..t-1 receive the preimage elements; every slot then gets its pre-round
// constant added.
// Expects a 1D launch with at least number_of_states * config.t threads.
template <typename S>
__global__ void prepare_poseidon_states(S * inp, S * states, size_t number_of_states, S domain_tag, const PoseidonConfiguration<S> config) {
    // size_t indexing: the previous `int` index overflowed past 2^31 elements
    // and triggered a signed/unsigned comparison against number_of_states
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t state_number = idx / config.t;
    if (state_number >= number_of_states) {
        return;
    }
    int element_number = idx % config.t;

    S prepared_element;

    // Domain separation: element 0 carries the domain tag, the rest are the
    // preimage (input is packed t-1 elements per state)
    if (element_number == 0) {
        prepared_element = domain_tag;
    } else {
        prepared_element = inp[state_number * (config.t - 1) + element_number - 1];
    }

    // Add pre-round constant
    prepared_element = prepared_element + config.round_constants[element_number];

    // Store element in state
    states[idx] = prepared_element;
}

// Poseidon s-box: raises the element to the fifth power, computed as
// (x^2)^2 * x — two squarings and one multiplication.
template <typename S>
__device__ __forceinline__ S sbox_alpha_five(S element) {
    S squared = S::sqr(element);
    S fourth = S::sqr(squared);
    return fourth * element;
}

// Multiplies the state vectors staged in this thread block by `matrix`
// (accessed as matrix[i * size + element_number], i.e. one matrix column per
// output element). Each thread deposits its own element into shared memory,
// then computes one output element as the dot product of its state's `size`
// shared elements with a matrix column. `shared_states` must hold
// blockDim.x elements of S.
// NOTE(review): both __syncthreads() calls must be reached by every thread
// of the block; callers must not early-return a subset of the block's
// threads before calling into here — TODO confirm all launch configurations
// guarantee this.
template <typename S>
__device__ S vecs_mul_matrix(S element, S * matrix, int element_number, int vec_number, int size, S * shared_states) {
// Publish this thread's element so the whole block can read it
shared_states[threadIdx.x] = element;
__syncthreads();

element = S::zero();
for (int i = 0; i < size; i++) {
element = element + (shared_states[vec_number * size + i] * matrix[i * size + element_number]);
}
// Second barrier: don't let any thread overwrite shared_states on the next
// call while others are still reading it
__syncthreads();
return element;
}

// One full Poseidon round for a single state element: apply the x^5 s-box,
// optionally add the round constant, then multiply the whole state by
// either the MDS matrix or the non-sparse transition matrix.
template <typename S>
__device__ S full_round(S element,
                        size_t rc_offset,
                        int local_state_number,
                        int element_number,
                        bool multiply_by_mds,
                        bool add_round_constant,
                        S * shared_states,
                        const PoseidonConfiguration<S> config) {
    S value = sbox_alpha_five(element);
    if (add_round_constant) {
        value = value + config.round_constants[rc_offset + element_number];
    }

    // Select which matrix multiplies the state this round
    S * matrix;
    if (multiply_by_mds) {
        matrix = config.mds_matrix;
    } else {
        matrix = config.non_sparse_matrix;
    }
    return vecs_mul_matrix(value, matrix, element_number, local_state_number, config.t, shared_states);
}

// Execute full rounds
template <typename S>
__global__ void full_rounds(S * states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConfiguration<S> config) {
extern __shared__ S shared_states[];

int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
int state_number = idx / config.t;
if (state_number >= number_of_states) {
return;
}
int local_state_number = threadIdx.x / config.t;
int element_number = idx % config.t;

for (int i = 0; i < config.full_rounds_half - 1; i++) {
states[idx] = full_round(states[idx],
rc_offset,
local_state_number,
element_number,
true,
true,
shared_states,
config);
rc_offset += config.t;
}

states[idx] = full_round(states[idx],
rc_offset,
local_state_number,
element_number,
!first_half,
first_half,
shared_states,
config);
}

// One partial Poseidon round, executed by a single thread over the whole
// state: s-box and round constant are applied to element 0 only, then the
// state is multiplied by this round's sparse matrix. Each sparse matrix is
// stored as (2t - 1) entries: the first row (t entries) followed by the
// remainder of the first column (t - 1 entries).
// Fixed: the function was declared to return S but had no return statement;
// flowing off the end of a non-void function is undefined behavior. The
// only caller ignores the result, so the return type is now void.
template <typename S>
__device__ void partial_round(S * state,
                              size_t rc_offset,
                              int round_number,
                              const PoseidonConfiguration<S> config) {
    S element = state[0];
    element = sbox_alpha_five(element);
    element = element + config.round_constants[rc_offset];

    S * sparse_matrix = &config.sparse_matrices[(config.t * 2 - 1) * round_number];

    // state[0] = first matrix row dotted with the state
    state[0] = element * sparse_matrix[0];
    for (int i = 1; i < config.t; i++) {
        state[0] = state[0] + (state[i] * sparse_matrix[i]);
    }

    // Remaining elements pick up only the first-column contribution
    for (int i = 1; i < config.t; i++) {
        state[i] = state[i] + (element * sparse_matrix[config.t + i - 1]);
    }
}

// Execute all partial rounds; one thread processes one entire state.
template <typename S>
__global__ void partial_rounds(S * states, size_t number_of_states, size_t rc_offset, const PoseidonConfiguration<S> config) {
    int thread_id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (thread_id >= number_of_states) {
        return;
    }

    // This thread owns a contiguous state of t elements
    S * state = states + (size_t)thread_id * config.t;

    for (int round = 0; round < config.partial_rounds; round++, rc_offset++) {
        partial_round(state, rc_offset, round, config);
    }
}

// Copies each hash result out of the states: the digest is element 1 of the
// t-element state.
template <typename S>
__global__ void get_hash_results(S * states, size_t number_of_states, S * out, int t) {
    int hash_index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (hash_index >= number_of_states) {
        return;
    }

    out[hash_index] = states[hash_index * t + 1];
}

// Hashes {blocks} preimages of (t - 1) field elements each, writing one
// digest per preimage to `out`. Host orchestration: copies input to the
// device, runs prepare -> first half of full rounds -> partial rounds ->
// second half of full rounds -> result extraction, then copies digests back.
// NOTE(review): no CUDA API call or kernel launch here is error-checked
// (cudaMalloc/cudaMemcpy return values ignored, no cudaGetLastError after
// launches) — failures will surface silently as garbage output. Consider a
// CUDA_CHECK-style macro.
template <typename S>
__host__ void Poseidon<S>::hash_blocks(const S * inp, size_t blocks, S * out, HashType hash_type) {
// Used in matrix multiplication

S * states, * inp_device;

// allocate memory for {blocks} states of {t} scalars each
cudaMalloc(&states, blocks * this->t * sizeof(S));

// Move input to cuda: input is packed (t - 1) elements per preimage
cudaMalloc(&inp_device, blocks * (this->t - 1) * sizeof(S));
cudaMemcpy(inp_device, inp, blocks * (this->t - 1) * sizeof(S), cudaMemcpyHostToDevice);

// Running offset into the round-constants table; advanced after each stage
size_t rc_offset = 0;

// The logic behind this is that 1 thread only works on 1 element
// We have {t} elements in each state, and {blocks} states total.
// Block size is rounded down to a multiple of t so a block always holds
// whole states (required by the shared-memory layout in full_rounds).
int number_of_threads = (256 / this->t) * this->t;
int hashes_per_block = number_of_threads / this->t;
int total_number_of_threads = blocks * this->t;
// Ceiling division: one extra block if the thread count doesn't divide evenly
int number_of_blocks = total_number_of_threads / number_of_threads +
static_cast<bool>(total_number_of_threads % number_of_threads);

// The partial rounds operates on the whole state, so we define
// the parallelism params for processing a single hash preimage per thread
int singlehash_block_size = 128;
int number_of_singlehash_blocks = blocks / singlehash_block_size + static_cast<bool>(blocks % singlehash_block_size);

// Pick the domain_tag accordingly
// NOTE(review): no default case — domain_tag stays uninitialized if
// hash_type is ever anything other than these two values; add a default or
// an exhaustive-switch check.
S domain_tag;
switch (hash_type) {
case HashType::ConstInputLen:
domain_tag = this->const_input_no_pad_domain_tag;
break;

case HashType::MerkleTree:
domain_tag = this->tree_domain_tag;
}

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
auto start_time = std::chrono::high_resolution_clock::now();
#endif

// Domain separation and adding pre-round constants
prepare_poseidon_states <<< number_of_blocks, number_of_threads >>> (inp_device, states, blocks, domain_tag, this->config);
// prepare consumed the first t round constants
rc_offset += this->t;
cudaFree(inp_device);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Domain separation: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);

auto end_time = std::chrono::high_resolution_clock::now();
auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// execute half full rounds; shared memory holds one block's worth of states
full_rounds <<< number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block * this->t >>> (states, blocks, rc_offset, true, this->config);
rc_offset += this->t * this->config.full_rounds_half;

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Full rounds 1. RCOFFSET: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);

end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// execute partial rounds: one thread per whole state (one constant per round)
partial_rounds <<< number_of_singlehash_blocks, singlehash_block_size >>> (states, blocks, rc_offset, this->config);
rc_offset += this->config.partial_rounds;

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Partial rounds. RCOFFSET: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);

end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// execute half full rounds (second half: first_half = false)
full_rounds <<< number_of_blocks, number_of_threads, sizeof(S) * hashes_per_block * this->t >>> (states, blocks, rc_offset, false, this->config);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Full rounds 2. RCOFFSET: " << rc_offset << std::endl;
print_buffer_from_cuda<S>(states, blocks * this->t);
end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
start_time = std::chrono::high_resolution_clock::now();
#endif

// get output: one digest (element 1 of each state) per preimage
S * out_device;
cudaMalloc(&out_device, blocks * sizeof(S));
get_hash_results <<< number_of_singlehash_blocks, singlehash_block_size >>> (states, blocks, out_device, this->config.t);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceSynchronize();
std::cout << "Get hash results" << std::endl;
end_time = std::chrono::high_resolution_clock::now();
elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
#endif
// blocking copy: also synchronizes on the preceding kernels in the stream
cudaMemcpy(out, out_device, blocks * sizeof(S), cudaMemcpyDeviceToHost);
cudaFree(out_device);
cudaFree(states);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
cudaDeviceReset();
#endif
}
Loading