From 8ac7792d6cbe387b37e41a734b8150875b66eb35 Mon Sep 17 00:00:00 2001
From: Changdao Dong <dongchangdao@gmail.com>
Date: Sat, 19 Mar 2022 01:17:12 -0700
Subject: [PATCH 1/4] Issue #266

accepting shares
---
 algorithm.c            |   67 +-
 driver-opencl.c        |   18 +
 kernel/chainox_navi.cl |  160 +---
 kernel/simd_navi.cl    | 2071 ++++++++++++++++++++++++++++++++++++++++
 ocl.c                  |   18 +
 5 files changed, 2209 insertions(+), 125 deletions(-)
 create mode 100644 kernel/simd_navi.cl

diff --git a/algorithm.c b/algorithm.c
index 7b45436..91fc888 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -773,6 +773,71 @@ static cl_int queue_chainox_kernel(struct __clState *clState, struct _dev_blk_ct
   return status;
 }
 
+static cl_int queue_chainox_navi_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: write buffer failed.", status);
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search arg failed.", status);
+  // skein - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search1 arg failed.", status);
+  // bmw - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search2 arg failed.", status);
+  // groestl - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search3 arg failed.", status);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search4 arg failed.", status);
+  // luffa - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search5 arg failed.", status);
+  // keccak - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search6 arg failed.", status);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search7 arg failed.", status);
+  // simd - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  CL_SET_ARG_N(1, clState->buffer1);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  CL_SET_ARG_N(1, clState->buffer1);
+  CL_SET_ARG_N(2, clState->buffer2);
+  // simd - search10
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  CL_SET_ARG_N(1, clState->buffer1);
+  CL_SET_ARG_N(2, clState->buffer2);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search8 arg failed.", status);
+  // shavite - search11
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search9 arg failed.", status);
+  // echo - search12
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+  if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search10 arg failed.", status);
+
+  return status;
+}
+
 static cl_int queue_allium_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
 {
   cl_kernel *kernel;
@@ -2622,7 +2687,7 @@ static algorithm_settings_t algos[] = {
 
   { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, NULL, NULL, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
   { "chainox", ALGO_0X10, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_kernel, gen_hash, append_x11_compiler_options },
-  { "chainox_navi", ALGO_0X10_NAVI, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_kernel, gen_hash, append_x11_compiler_options },
+  { "chainox_navi", ALGO_0X10_NAVI, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_navi_kernel, gen_hash, append_x11_compiler_options },
 
   { "sibcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 11, 2 * 16 * 4194304, 0, sibcoin_regenhash, NULL, NULL, queue_sibcoin_mod_kernel, gen_hash, append_x11_compiler_options },
   
diff --git a/driver-opencl.c b/driver-opencl.c
index 14eb0e5..213d27b 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1603,6 +1603,24 @@ if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI
       globalOffset2[0] = (*p_global_work_offset) * 4;
       status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
         globalThreads2, localThreads2, 0, NULL, NULL);
+    } else if (gpu->algorithm.type == ALGO_0X10_NAVI && i == 7) {
+      size_t globalThreads2[1];
+      size_t localThreads2[1];
+      size_t globalOffset2[1];
+      globalThreads2[0] = globalThreads[0] * 8;
+      localThreads2[0] = 128;
+      globalOffset2[0] = (*p_global_work_offset) * 4;
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
+        globalThreads2, localThreads2, 0, NULL, NULL);
+    } else if (gpu->algorithm.type == ALGO_0X10_NAVI && (i == 8 || i == 9)) {
+      size_t globalThreads2[1];
+      size_t localThreads2[1];
+      size_t globalOffset2[1];
+      globalThreads2[0] = globalThreads[0];
+      localThreads2[0] = 128;
+      globalOffset2[0] = (*p_global_work_offset) * 4;
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
+        globalThreads2, localThreads2, 0, NULL, NULL);
     } else if (gpu->algorithm.type == ALGO_0X10 && i== 2 && clState->prebuilt) {
        size_t globalThreads2[1];
       size_t localThreads2[1];
diff --git a/kernel/chainox_navi.cl b/kernel/chainox_navi.cl
index dd6062d..54f4b18 100644
--- a/kernel/chainox_navi.cl
+++ b/kernel/chainox_navi.cl
@@ -98,7 +98,7 @@ typedef int sph_s32;
 #include "luffa.cl"
 #include "cubehash.cl"
 #include "shavite.cl"
-#include "simd.cl"
+#include "simd_navi.cl"
 #include "echo.cl"
 
 #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@@ -783,137 +783,49 @@ __kernel void search7(__global hash_t* hashes)
   barrier(CLK_GLOBAL_MEM_FENCE);
 }
 
-__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search8(__global hash_t* hashes)
+__attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel void search8(__global uint* g_hash, __global uint4 *g_temp4)
 {
   uint gid = get_global_id(0);
-  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+  int threadBloc = (gid-get_global_offset(0)) / 8;
+  int hashPosition = threadBloc * 16;
+  __global uint *inpHash = &g_hash[hashPosition];
 
-  // simd
-  s32 q[256];
-  unsigned char x[128];
-  for(unsigned int i = 0; i < 64; i++)
-  x[i] = hash->h1[i];
-  for(unsigned int i = 64; i < 128; i++)
-  x[i] = 0;
-
-  u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558);
-  u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E);
-  u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257);
-  u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
-
-  FFT256(0, 1, 0, ll1);
-  for (int i = 0; i < 256; i ++)
-  {
-  s32 tq;
+  // Read hash per 8 threads
+  uint Hash[2];
+  int ndx = get_local_id(0) & 7;
+  Hash[0] = inpHash[ndx];
+  Hash[1] = inpHash[ndx + 8];
 
-  tq = q[i] + yoff_b_n[i];
-  tq = REDS2(tq);
-  tq = REDS1(tq);
-  tq = REDS1(tq);
-  q[i] = (tq <= 128 ? tq : tq - 257);
-  }
+  // Puffer für expandierte Nachricht
+  __global uint4 *temp4 = &g_temp4[hashPosition * 4];
 
-  A0 ^= hash->h4[0];
-  A1 ^= hash->h4[1];
-  A2 ^= hash->h4[2];
-  A3 ^= hash->h4[3];
-  A4 ^= hash->h4[4];
-  A5 ^= hash->h4[5];
-  A6 ^= hash->h4[6];
-  A7 ^= hash->h4[7];
-  B0 ^= hash->h4[8];
-  B1 ^= hash->h4[9];
-  B2 ^= hash->h4[10];
-  B3 ^= hash->h4[11];
-  B4 ^= hash->h4[12];
-  B5 ^= hash->h4[13];
-  B6 ^= hash->h4[14];
-  B7 ^= hash->h4[15];
-
-  ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
-  ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
-  ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
-  ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
-
-  STEP_BIG(
-  C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
-  C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
-  IF,  4, 13, PP8_4_);
-
-  STEP_BIG(
-  C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
-  C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
-  IF, 13, 10, PP8_5_);
-
-  STEP_BIG(
-  C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
-  C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
-  IF, 10, 25, PP8_6_);
-
-  STEP_BIG(
-  C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
-  C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
-  IF, 25,  4, PP8_0_);
-
-  u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7;
-  u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7;
-  u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7;
-  u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7;
-
-  #define q SIMD_Q
-
-  A0 ^= 0x200;
-
-  ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
-  ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
-  ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
-  ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
-
-  STEP_BIG(
-  COPY_A0, COPY_A1, COPY_A2, COPY_A3,
-  COPY_A4, COPY_A5, COPY_A6, COPY_A7,
-  IF,  4, 13, PP8_4_);
-
-  STEP_BIG(
-  COPY_B0, COPY_B1, COPY_B2, COPY_B3,
-  COPY_B4, COPY_B5, COPY_B6, COPY_B7,
-  IF, 13, 10, PP8_5_);
-
-  STEP_BIG(
-  COPY_C0, COPY_C1, COPY_C2, COPY_C3,
-  COPY_C4, COPY_C5, COPY_C6, COPY_C7,
-  IF, 10, 25, PP8_6_);
-
-  STEP_BIG(
-  COPY_D0, COPY_D1, COPY_D2, COPY_D3,
-  COPY_D4, COPY_D5, COPY_D6, COPY_D7,
-  IF, 25,  4, PP8_0_);
-
-  #undef q
-
-  hash->h4[0] = A0;
-  hash->h4[1] = A1;
-  hash->h4[2] = A2;
-  hash->h4[3] = A3;
-  hash->h4[4] = A4;
-  hash->h4[5] = A5;
-  hash->h4[6] = A6;
-  hash->h4[7] = A7;
-  hash->h4[8] = B0;
-  hash->h4[9] = B1;
-  hash->h4[10] = B2;
-  hash->h4[11] = B3;
-  hash->h4[12] = B4;
-  hash->h4[13] = B5;
-  hash->h4[14] = B6;
-  hash->h4[15] = B7;
+  Expansion(Hash, temp4);
+}
 
-  barrier(CLK_GLOBAL_MEM_FENCE);
+__attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel void search9(__global uint* g_hash, __global uint4 *g_fft4, __global uint *g_state)
+{
+  uint gid = get_global_id(0);
+  uint thread = gid - get_global_offset(0);
+
+  __global uint *Hash = &g_hash[thread * 16];
+  Compression1(Hash, thread, g_fft4, g_state);
+  Compression2(thread, g_fft4, g_state);
+}
+
+__attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel void search10(__global uint* g_hash, __global uint4 *g_fft4, __global uint *g_state)
+{
+  uint gid = get_global_id(0);
+  uint thread = gid - get_global_offset(0);
+
+  __global uint *Hash = &g_hash[thread * 16];
+  Final(Hash, thread, g_fft4, g_state);
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search9(__global hash_t* hashes)
+__kernel void search11(__global hash_t* hashes)
 {
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -995,7 +907,7 @@ __kernel void search9(__global hash_t* hashes)
 
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search10(__global hash_t* hashes, __global uint* output, const ulong target)
+__kernel void search12(__global hash_t* hashes, __global uint* output, const ulong target)
 {
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
diff --git a/kernel/simd_navi.cl b/kernel/simd_navi.cl
new file mode 100644
index 0000000..78c97f7
--- /dev/null
+++ b/kernel/simd_navi.cl
@@ -0,0 +1,2071 @@
+
+/*inline uint __byte_perm(uint a, uint b, uint p) {
+    uint ret;
+    __asm__("v_perm_b32 %0, %1, %2, %3;" : "=&v"(ret) : "v"(b), "v"(a), "s"(p));
+    return ret;
+}*/
+
+#define __shfl_lane_8(ret, v, u) { \
+    if ((u & 0x7) == 0) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[0,0,0,0,0,0,0,0]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+    if ((u & 0x7) == 1) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[1,1,1,1,1,1,1,1]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+    if ((u & 0x7) == 2) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[2,2,2,2,2,2,2,2]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+    if ((u & 0x7) ==3) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[3,3,3,3,3,3,3,3]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+    if ((u & 0x7) ==4) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[4,4,4,4,4,4,4,4]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+    if ((u & 0x7) ==5) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[5,5,5,5,5,5,5,5]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+    if ((u & 0x7) ==6) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[6,6,6,6,6,6,6,6]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+    if ((u & 0x7) ==7) { \
+      __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[7,7,7,7,7,7,7,7]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+    } \
+}
+ 
+#define __shfl_right1(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[7,0,1,2,3,4,5,6]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_left1(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[1,2,3,4,5,6,7,0]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_xor1(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[1,0,3,2,5,4,7,6]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_left4(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[4,5,6,7,0,1,2,3]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm0(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[2, 3, 6, 7, 0, 1, 4, 5]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm1(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[6, 7, 2, 3, 4, 5, 0, 1]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm2(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[7, 6, 5, 4, 3, 2, 1, 0]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm3(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[1, 0, 3, 2, 5, 4, 7, 6]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm4(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[0, 1, 4, 5, 6, 7, 2, 3]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm5(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[6, 7, 2, 3, 0, 1, 4, 5]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm6(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[6, 7, 0, 1, 4, 5, 2, 3]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_cperm7(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[4, 5, 2, 3, 6, 7, 0, 1]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_half0(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[0,1,2,3,0,1,2,3]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_half1(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[4,5,6,7,4,5,6,7]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_half2(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[0,1,0,1,4,5,4,5]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_half3(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[2,3,2,3,6,7,6,7]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_half4(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[0,0,2,2,4,4,6,6]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+#define __shfl_half5(ret, v) { \
+    __asm ( \
+          "v_mov_b32_dpp  %[d0], %[a0] dpp8:[1,1,3,3,5,5,7,7]\n" \
+          : [d0] "=&v" (ret) \
+          : [a0] "v" (v)); \
+}
+
+
+__constant uint c_IV_512[32] = {
+     0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
+     0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
+     0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
+     0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
+ };
+ 
+__constant short c_FFT128_8_16_Twiddle[128] = {
+     1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+     1,  60,   2, 120,   4,  -17,   8, -34,  16, -68,  32, 121,  64, -15, 128, -30,
+     1,  46,  60, -67,   2,   92, 120, 123,   4, -73, -17, -11,   8, 111, -34, -22,
+     1, -67, 120, -73,   8,  -22, -68, -70,  64,  81, -30, -46,  -2,-123,  17,-111,
+     1,-118,  46, -31,  60,  116, -67, -61,   2,  21,  92, -62, 120, -25, 123,-122,
+     1, 116,  92,-122, -17,   84, -22,  18,  32, 114, 117, -49, -30, 118,  67,  62,
+     1, -31, -67,  21, 120, -122, -73, -50,   8,   9, -22, -89, -68,  52, -70, 114,
+     1, -61, 123, -50, -34,   18, -70, -99, 128, -98,  67,  25,  17,  -9,  35, -79
+ };
+ 
+
+ __constant short c_FFT256_2_128_Twiddle[128] = {
+       1,  41,-118,  45,  46,  87, -31,  14,
+      60,-110, 116,-127, -67,  80, -61,  69,
+       2,  82,  21,  90,  92, -83, -62,  28,
+     120,  37, -25,   3, 123, -97,-122,-119,
+       4, -93,  42, -77, -73,  91,-124,  56,
+     -17,  74, -50,   6, -11,  63,  13,  19,
+       8,  71,  84, 103, 111, -75,   9, 112,
+     -34,-109,-100,  12, -22, 126,  26,  38,
+      16,-115, -89, -51, -35, 107,  18, -33,
+     -68,  39,  57,  24, -44,  -5,  52,  76,
+      32,  27,  79,-102, -70, -43,  36, -66,
+     121,  78, 114,  48, -88, -10, 104,-105,
+      64,  54, -99,  53, 117, -86,  72, 125,
+     -15,-101, -29,  96,  81, -20, -49,  47,
+     128, 108,  59, 106, -23,  85,-113,  -7,
+     -30,  55, -58, -65, -95, -40, -98,  94
+ };
+
+/********************* Message expansion ************************/
+
+/*
+ * Reduce modulo 257; result is in [-127; 383]
+ * REDUCE(x) := (x&255) - (x>>8)
+ */
+#define REDUCE(x) \
+    (((x)&255) - ((x)>>8))
+
+/*
+ * Reduce from [-127; 383] to [-128; 128]
+ * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
+ */
+#define EXTRA_REDUCE_S(x) \
+    ((x)<=128 ? (x) : (x)-257)
+
+/*
+ * Reduce modulo 257; result is in [-128; 128]
+ */
+#define REDUCE_FULL_S(x) \
+    EXTRA_REDUCE_S(REDUCE(x))
+
+// Parallelization:
+//
+// FFT_8  wird 2 times 8-fach parallel ausgeführt (in FFT_64)
+//        and  1 time 16-fach parallel (in FFT_128_full)
+//
+// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations
+
+/**
+ * FFT_8 using w=4 as 8th root of unity
+ * Unrolled decimation in frequency (DIF) radix-2 NTT.
+ * Output data is in revbin_permuted order.
+ */
+
+void inline FFT_8(int *y, int stripe)
+{
+#define X(i) y[stripe*i]
+
+#define DO_REDUCE(i) \
+    X(i) = REDUCE(X(i))
+
+#define DO_REDUCE_FULL_S(i) \
+do { \
+    X(i) = REDUCE(X(i)); \
+    X(i) = EXTRA_REDUCE_S(X(i)); \
+} while(0)
+
+#define BUTTERFLY(i,j,n) \
+do { \
+    int u= X(i); \
+    int v= X(j); \
+    X(i) = u+v; \
+    X(j) = (u-v) << (2*n); \
+} while(0)
+
+    BUTTERFLY(0, 4, 0);
+    BUTTERFLY(1, 5, 1);
+    BUTTERFLY(2, 6, 2);
+    BUTTERFLY(3, 7, 3);
+
+    DO_REDUCE(6);
+    DO_REDUCE(7);
+
+    BUTTERFLY(0, 2, 0);
+    BUTTERFLY(4, 6, 0);
+    BUTTERFLY(1, 3, 2);
+    BUTTERFLY(5, 7, 2);
+
+    DO_REDUCE(7);
+
+    BUTTERFLY(0, 1, 0);
+    BUTTERFLY(2, 3, 0);
+    BUTTERFLY(4, 5, 0);
+    BUTTERFLY(6, 7, 0);
+
+    DO_REDUCE_FULL_S(0);
+    DO_REDUCE_FULL_S(1);
+    DO_REDUCE_FULL_S(2);
+    DO_REDUCE_FULL_S(3);
+    DO_REDUCE_FULL_S(4);
+    DO_REDUCE_FULL_S(5);
+    DO_REDUCE_FULL_S(6);
+    DO_REDUCE_FULL_S(7);
+
+#undef X
+#undef DO_REDUCE
+#undef DO_REDUCE_FULL_S
+#undef BUTTERFLY
+}
+
+/**
+ * FFT_16 using w=2 as 16th root of unity
+ * Unrolled decimation in frequency (DIF) radix-2 NTT.
+ * Output data is in revbin_permuted order.
+ */
+
+void inline FFT_16(int *y)
+{
+#define DO_REDUCE_FULL_S(i) \
+    do { \
+        y[i] = REDUCE(y[i]); \
+        y[i] = EXTRA_REDUCE_S(y[i]); \
+    } while(0)
+
+    int u,v;
+
+    // BUTTERFLY(0, 8, 0);
+    // BUTTERFLY(1, 9, 1);
+    // BUTTERFLY(2, 10, 2);
+    // BUTTERFLY(3, 11, 3);
+    // BUTTERFLY(4, 12, 4);
+    // BUTTERFLY(5, 13, 5);
+    // BUTTERFLY(6, 14, 6);
+    // BUTTERFLY(7, 15, 7);
+    {
+        u = y[0]; // 0..7
+        v = y[1]; // 8..15
+        y[0] = u+v;
+        y[1] = (u-v) << (get_local_id(0)&7);
+    }
+
+    // DO_REDUCE(11);
+    // DO_REDUCE(12);
+    // DO_REDUCE(13);
+    // DO_REDUCE(14);
+    // DO_REDUCE(15);
+    if ((get_local_id(0)&7) >=3) y[1] = REDUCE(y[1]);  // 11...15
+
+    // BUTTERFLY( 0, 4, 0);
+    // BUTTERFLY( 1, 5, 2);
+    // BUTTERFLY( 2, 6, 4);
+    // BUTTERFLY( 3, 7, 6);
+    {
+        __shfl_half0(u, (int)y[0]); // 0,1,2,3  0,1,2,3
+        __shfl_half1(v, (int)y[0]); // 4,5,6,7  4,5,6,7
+        y[0] = ((get_local_id(0)&7) < 4) ? (u+v) : ((u-v) << (2*(get_local_id(0)&3)));
+    }
+
+    // BUTTERFLY( 8, 12, 0);
+    // BUTTERFLY( 9, 13, 2);
+    // BUTTERFLY(10, 14, 4);
+    // BUTTERFLY(11, 15, 6);
+    {
+        __shfl_half0(u, (int)y[1]); // 8,9,10,11    8,9,10,11
+        __shfl_half1(v, (int)y[1]); // 12,13,14,15  12,13,14,15
+        y[1] = ((get_local_id(0)&7) < 4) ? (u+v) : ((u-v) << (2*(get_local_id(0)&3)));
+    }
+
+    // DO_REDUCE(5);
+    // DO_REDUCE(7);
+    // DO_REDUCE(13);
+    // DO_REDUCE(15);
+    if ((get_local_id(0)&1) && (get_local_id(0)&7) >= 4) {
+        y[0] = REDUCE(y[0]);  // 5, 7
+        y[1] = REDUCE(y[1]);  // 13, 15
+    }
+
+    // BUTTERFLY( 0, 2, 0);
+    // BUTTERFLY( 1, 3, 4);
+    // BUTTERFLY( 4, 6, 0);
+    // BUTTERFLY( 5, 7, 4);
+    {
+        __shfl_half2(u, (int)y[0]); // 0,1,0,1  4,5,4,5
+        __shfl_half3(v, (int)y[0]); // 2,3,2,3  6,7,6,7
+        y[0] = ((get_local_id(0)&3) < 2) ? (u+v) : ((u-v) << (4*(get_local_id(0)&1)));
+    }
+
+    // BUTTERFLY( 8, 10, 0);
+    // BUTTERFLY( 9, 11, 4);
+    // BUTTERFLY(12, 14, 0);
+    // BUTTERFLY(13, 15, 4);
+    {
+        __shfl_half2(u, (int)y[1]); // 8,9,8,9      12,13,12,13
+        __shfl_half3(v, (int)y[1]); // 10,11,10,11  14,15,14,15
+        y[1] = ((get_local_id(0)&3) < 2) ? (u+v) : ((u-v) << (4*(get_local_id(0)&1)));
+    }
+
+    // BUTTERFLY( 0, 1, 0);
+    // BUTTERFLY( 2, 3, 0);
+    // BUTTERFLY( 4, 5, 0);
+    // BUTTERFLY( 6, 7, 0);
+    {
+        __shfl_half4(u, (int)y[0]); // 0,0,2,2      4,4,6,6
+        __shfl_half5(v, (int)y[0]); // 1,1,3,3      5,5,7,7
+        y[0] = ((get_local_id(0)&1) < 1) ? (u+v) : (u-v);
+    }
+
+    // BUTTERFLY( 8, 9, 0);
+    // BUTTERFLY(10, 11, 0);
+    // BUTTERFLY(12, 13, 0);
+    // BUTTERFLY(14, 15, 0);
+    {
+        __shfl_half4(u, (int)y[1]); // 8,8,10,10    12,12,14,14
+        __shfl_half5(v, (int)y[1]); // 9,9,11,11    13,13,15,15
+        y[1] = ((get_local_id(0)&1) < 1) ? (u+v) : (u-v);
+    }
+
+    DO_REDUCE_FULL_S( 0); // 0...7
+    DO_REDUCE_FULL_S( 1); // 8...15
+
+#undef DO_REDUCE_FULL_S
+}
+
+ void inline FFT_128_full(int *y)
+ {
+     int i;
+ 
+     FFT_8(y+0,2); // eight parallel FFT8's
+     FFT_8(y+1,2); // eight parallel FFT8's
+ 
+ #pragma unroll 16
+     for (i=0; i<16; i++)
+     /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(get_local_id(0)&7)]);
+ 
+ #pragma unroll 8
+     for (i=0; i<8; i++)
+         FFT_16(y+2*i);  // eight sequential FFT16's, each one executed in parallel by 8 threads
+ }
+
+void inline FFT_256_halfzero(int *y)
+{
+    /*
+     * FFT_256 using w=41 as 256th root of unity.
+     * Decimation in frequency (DIF) NTT.
+     * Output data is in revbin_permuted order.
+     * In place.
+     */
+    const int tmp = y[15];
+
+#pragma unroll 8
+    for (int i=0; i<8; i++)
+        y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(get_local_id(0)&7)]);
+#pragma unroll 8
+    for (int i=8; i<16; i++)
+        y[16+i] = 0;
+
+    /* handle X^255 with an additional butterfly */
+    if ((get_local_id(0)&7) == 7)
+    {
+        y[15] = REDUCE(tmp + 1);
+        y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]);
+    }
+
+    FFT_128_full(y);
+    FFT_128_full(y+16);
+}
+
+ void Expansion(uint *data, __global uint4 *g_temp4)
+ {
+     uint plo, phi;
+     /* Message Expansion using Number Theoretical Transform similar to FFT */
+     int expanded[32];
+ #pragma unroll 4
+     for (int i=0; i < 4; i++) {
+         __shfl_lane_8(plo, (int)data[0], 2*i);
+         __shfl_lane_8(phi, (int)data[0], (2*i)+1);
+         expanded[  i] = __byte_perm(plo, phi, get_local_id(0)&7)&0xff;
+         __shfl_lane_8(plo, (int)data[1], 2*i);
+         __shfl_lane_8(phi, (int)data[1], (2*i)+1);
+         expanded[4+i] = __byte_perm(plo, phi, get_local_id(0)&7)&0xff;
+     }
+ #pragma unroll 8
+     for (int i=8; i < 16; i++)
+         expanded[i] = 0;
+ 
+     FFT_256_halfzero(expanded);
+ 
+     // store w matrices in global memory
+ 
+ #define mul_185(x) ( (x)*185 )
+ #define mul_233(x) ( (x)*233 )
+ 
+     uint4 vec0;
+     int p, q, p1, q1, p2, q2;
+     bool even = (get_local_id(0) & 1) == 0;
+ 
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         2 2 2 2 2 2 2 2     2 2 2 2 2 2 2 2
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         6 6 6 6 6 6 6 6     6 6 6 6 6 6 6 6
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         0 0 0 0 0 0 0 0     0 0 0 0 0 0 0 0
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         4 4 4 4 4 4 4 4     4 4 4 4 4 4 4 4
+ 
+     // 2 6 0 4
+ 
+     p1 = expanded[ 0]; __shfl_right1(p2, expanded[ 2]); p = even ? p1 : p2;
+     q1 = expanded[16]; __shfl_right1(q2, expanded[18]); q = even ? q1 : q2;
+     __shfl_cperm0(vec0.x, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = expanded[ 8]; __shfl_right1(p2, expanded[10]); p = even ? p1 : p2;
+     q1 = expanded[24]; __shfl_right1(q2, expanded[26]); q = even ? q1 : q2;
+     __shfl_cperm0(vec0.y, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = expanded[ 4]; __shfl_right1(p2, expanded[ 6]); p = even ? p1 : p2;
+     q1 = expanded[20]; __shfl_right1(q2, expanded[22]); q = even ? q1 : q2;
+     __shfl_cperm0(vec0.z, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = expanded[12]; __shfl_right1(p2, expanded[14]); p = even ? p1 : p2;
+     q1 = expanded[28]; __shfl_right1(q2, expanded[30]); q = even ? q1 : q2;
+     __shfl_cperm0(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     g_temp4[get_local_id(0)&7] = vec0;
+ 
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         6 6 6 6 6 6 6 6     6 6 6 6 6 6 6 6
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         2 2 2 2 2 2 2 2     2 2 2 2 2 2 2 2
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         4 4 4 4 4 4 4 4     4 4 4 4 4 4 4 4
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         0 0 0 0 0 0 0 0     0 0 0 0 0 0 0 0
+ 
+     // 6 2 4 0
+
+     p1 = expanded[ 1]; __shfl_right1(p2, expanded[ 3]); p = even ? p1 : p2;
+     q1 = expanded[17]; __shfl_right1(q2, expanded[19]); q = even ? q1 : q2;
+     __shfl_cperm1(vec0.x, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = expanded[ 9]; __shfl_right1(p2, expanded[11]); p = even ? p1 : p2;
+     q1 = expanded[25]; __shfl_right1(q2, expanded[27]); q = even ? q1 : q2;
+     __shfl_cperm1(vec0.y, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = expanded[ 5]; __shfl_right1(p2, expanded[ 7]); p = even ? p1 : p2;
+     q1 = expanded[21]; __shfl_right1(q2, expanded[23]); q = even ? q1 : q2;
+     __shfl_cperm1(vec0.z, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = expanded[13]; __shfl_right1(p2, expanded[15]); p = even ? p1 : p2;
+     q1 = expanded[29]; __shfl_right1(q2, expanded[31]); q = even ? q1 : q2;
+     __shfl_cperm1(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     g_temp4[8+(get_local_id(0)&7)] = vec0;
+ 
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         7 7 7 7 7 7 7 7     7 7 7 7 7 7 7 7
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         5 5 5 5 5 5 5 5     5 5 5 5 5 5 5 5
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         3 3 3 3 3 3 3 3     3 3 3 3 3 3 3 3
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         1 1 1 1 1 1 1 1     1 1 1 1 1 1 1 1
+ 
+     // 7 5 3 1
+ 
+     bool hi = (get_local_id(0)&7)>=4;
+ 
+     p1 = hi?expanded[ 1]:expanded[ 0]; __shfl_left1(p2, hi?expanded[ 3]:expanded[ 2]); p = !even ? p1 : p2;
+     q1 = hi?expanded[17]:expanded[16]; __shfl_left1(q2, hi?expanded[19]:expanded[18]); q = !even ? q1 : q2;
+     __shfl_cperm2(vec0.x, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = hi?expanded[ 9]:expanded[ 8]; __shfl_left1(p2, hi?expanded[11]:expanded[10]); p = !even ? p1 : p2;
+     q1 = hi?expanded[25]:expanded[24]; __shfl_left1(q2, hi?expanded[27]:expanded[26]); q = !even ? q1 : q2;
+     __shfl_cperm2(vec0.y, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = hi?expanded[ 5]:expanded[ 4]; __shfl_left1(p2, hi?expanded[ 7]:expanded[ 6]); p = !even ? p1 : p2;
+     q1 = hi?expanded[21]:expanded[20]; __shfl_left1(q2, hi?expanded[23]:expanded[22]); q = !even ? q1 : q2;
+     __shfl_cperm2(vec0.z, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     p1 = hi?expanded[13]:expanded[12]; __shfl_left1(p2, hi?expanded[15]:expanded[14]); p = !even ? p1 : p2;
+     q1 = hi?expanded[29]:expanded[28]; __shfl_left1(q2, hi?expanded[31]:expanded[30]); q = !even ? q1 : q2;
+     __shfl_cperm2(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
+     g_temp4[16+(get_local_id(0)&7)] = vec0;
+ 
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         1 1 1 1 1 1 1 1     1 1 1 1 1 1 1 1
+ //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         3 3 3 3 3 3 3 3     3 3 3 3 3 3 3 3
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         5 5 5 5 5 5 5 5     5 5 5 5 5 5 5 5
+ //  0   8   4  12   2  10   6  14      16  24  20  28  18  26  22  30         7 7 7 7 7 7 7 7     7 7 7 7 7 7 7 7
+ 
+   // 1 3 5 7
+ 
+     bool lo = (get_local_id(0)&7)<4;
+ 
+     p1 = lo?expanded[ 1]:expanded[ 0]; __shfl_left1(p2, lo?expanded[ 3]:expanded[ 2]); p = !even ? p1 : p2;
+     q1 = lo?expanded[17]:expanded[16]; __shfl_left1(q2, lo?expanded[19]:expanded[18]); q = !even ? q1 : q2;
+     __shfl_cperm3(vec0.x, (int)__byte_perm(mul_185(p),  mul_185(q) , 0x05040100));
+     p1 = lo?expanded[ 9]:expanded[ 8]; __shfl_left1(p2, lo?expanded[11]:expanded[10]); p = !even ? p1 : p2;
+     q1 = lo?expanded[25]:expanded[24]; __shfl_left1(q2, lo?expanded[27]:expanded[26]); q = !even ? q1 : q2;
+     __shfl_cperm3(vec0.y, (int)__byte_perm(mul_185(p),  mul_185(q) , 0x05040100));
+     p1 = lo?expanded[ 5]:expanded[ 4]; __shfl_left1(p2, lo?expanded[ 7]:expanded[ 6]); p = !even ? p1 : p2;
+     q1 = lo?expanded[21]:expanded[20]; __shfl_left1(q2, lo?expanded[23]:expanded[22]); q = !even ? q1 : q2;
+     __shfl_cperm3(vec0.z, (int)__byte_perm(mul_185(p),  mul_185(q) , 0x05040100));
+     p1 = lo?expanded[13]:expanded[12]; __shfl_left1(p2, lo?expanded[15]:expanded[14]); p = !even ? p1 : p2;
+     q1 = lo?expanded[29]:expanded[28]; __shfl_left1(q2, lo?expanded[31]:expanded[30]); q = !even ? q1 : q2;
+     __shfl_cperm3(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q) , 0x05040100));
+     g_temp4[24+(get_local_id(0)&7)] = vec0;
+ 
+ //  1   9   5  13   3  11   7  15       1   9   5  13   3  11   7  15         0 0 0 0 0 0 0 0     1 1 1 1 1 1 1 1
+ //  0   8   4  12   2  10   6  14       0   8   4  12   2  10   6  14         4 4 4 4 4 4 4 4     5 5 5 5 5 5 5 5
+ //  1   9   5  13   3  11   7  15       1   9   5  13   3  11   7  15         6 6 6 6 6 6 6 6     7 7 7 7 7 7 7 7
+ //  0   8   4  12   2  10   6  14       0   8   4  12   2  10   6  14         2 2 2 2 2 2 2 2     3 3 3 3 3 3 3 3
+ 
+ //{ 8, 72, 40, 104, 24, 88, 56, 120 },   { 9, 73, 41, 105, 25, 89, 57, 121 },
+ //{ 4, 68, 36, 100, 20, 84, 52, 116 },   { 5, 69, 37, 101, 21, 85, 53, 117 },
+ //{ 14, 78, 46, 110, 30, 94, 62, 126 },  { 15, 79, 47, 111, 31, 95, 63, 127 },
+ //{ 2, 66, 34, 98, 18, 82, 50, 114 },    { 3, 67, 35, 99, 19, 83, 51, 115 },
+ 
+     bool sel = ((get_local_id(0)+2)&7) >= 4;  // 2,3,4,5
+ 
+     p1 = sel?expanded[0]:expanded[1]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[2]:expanded[3]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm4(vec0.x, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     p1 = sel?expanded[8]:expanded[9]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[10]:expanded[11]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm4(vec0.y, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     p1 = sel?expanded[4]:expanded[5]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[6]:expanded[7]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm4(vec0.z, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     p1 = sel?expanded[12]:expanded[13]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[14]:expanded[15]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm4(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     g_temp4[32+(get_local_id(0)&7)] = vec0;
+ 
+ //  0   8   4  12   2  10   6  14       0   8   4  12   2  10   6  14         6 6 6 6 6 6 6 6     7 7 7 7 7 7 7 7
+ //  1   9   5  13   3  11   7  15       1   9   5  13   3  11   7  15         2 2 2 2 2 2 2 2     3 3 3 3 3 3 3 3
+ //  0   8   4  12   2  10   6  14       0   8   4  12   2  10   6  14         0 0 0 0 0 0 0 0     1 1 1 1 1 1 1 1
+ //  1   9   5  13   3  11   7  15       1   9   5  13   3  11   7  15         4 4 4 4 4 4 4 4     5 5 5 5 5 5 5 5
+ 
+     p1 = sel?expanded[1]:expanded[0]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[3]:expanded[2]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm5(vec0.x, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     p1 = sel?expanded[9]:expanded[8]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[11]:expanded[10]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm5(vec0.y, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     p1 = sel?expanded[5]:expanded[4]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[7]:expanded[6]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm5(vec0.z, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     p1 = sel?expanded[13]:expanded[12]; __shfl_xor1(q1, p1);
+     q2 = sel?expanded[15]:expanded[14]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm5(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     g_temp4[40+(get_local_id(0)&7)] = vec0;
+ 
+ // 16  24  20  28  18  26  22  30      16  24  20  28  18  26  22  30         6 6 6 6 6 6 6 6     7 7 7 7 7 7 7 7
+ // 16  24  20  28  18  26  22  30      16  24  20  28  18  26  22  30         0 0 0 0 0 0 0 0     1 1 1 1 1 1 1 1
+ // 17  25  21  29  19  27  23  31      17  25  21  29  19  27  23  31         0 0 0 0 0 0 0 0     1 1 1 1 1 1 1 1
+ // 17  25  21  29  19  27  23  31      17  25  21  29  19  27  23  31         6 6 6 6 6 6 6 6     7 7 7 7 7 7 7 7
+ 
+     // sel markiert threads 2,3,4,5
+ 
+     int t;
+     __shfl_left4(t, expanded[17]); p1 = sel?t:expanded[16]; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[19]); q2 = sel?t:expanded[18]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm6(vec0.x, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     __shfl_left4(t, expanded[25]); p1 = sel?t:expanded[24]; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[27]); q2 = sel?t:expanded[26]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm6(vec0.y, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     __shfl_left4(t, expanded[21]); p1 = sel?t:expanded[20]; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[23]); q2 = sel?t:expanded[22]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm6(vec0.z, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     __shfl_left4(t, expanded[29]); p1 = sel?t:expanded[28]; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[31]); q2 = sel?t:expanded[30]; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm6(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     g_temp4[48+(get_local_id(0)&7)] = vec0;
+ 
+ // 17  25  21  29  19  27  23  31      17  25  21  29  19  27  23  31         4 4 4 4 4 4 4 4     5 5 5 5 5 5 5 5
+ // 17  25  21  29  19  27  23  31      17  25  21  29  19  27  23  31         2 2 2 2 2 2 2 2     3 3 3 3 3 3 3 3
+ // 16  24  20  28  18  26  22  30      16  24  20  28  18  26  22  30         2 2 2 2 2 2 2 2     3 3 3 3 3 3 3 3
+ // 16  24  20  28  18  26  22  30      16  24  20  28  18  26  22  30         4 4 4 4 4 4 4 4     5 5 5 5 5 5 5 5
+ 
+     // sel markiert threads 2,3,4,5
+ 
+     __shfl_left4(t, expanded[16]); p1 = sel?expanded[17]:t; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[18]); q2 = sel?expanded[19]:t; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm7(vec0.x, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     __shfl_left4(t, expanded[24]); p1 = sel?expanded[25]:t; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[26]); q2 = sel?expanded[27]:t; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm7(vec0.y, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     __shfl_left4(t, expanded[20]); p1 = sel?expanded[21]:t; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[22]); q2 = sel?expanded[23]:t; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm7(vec0.z, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     __shfl_left4(t, expanded[28]); p1 = sel?expanded[29]:t; __shfl_xor1(q1, p1);
+     __shfl_left4(t, expanded[30]); q2 = sel?expanded[31]:t; __shfl_xor1(p2, q2);
+     p = even? p1 : p2; q = even? q1 : q2;
+     __shfl_cperm7(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
+     g_temp4[56+(get_local_id(0)&7)] = vec0;
+ 
+ #undef mul_185
+ #undef mul_233
+ }
+
+// ======================================================================================================
+
+ #define IF(x, y, z) (((y ^ z) & x) ^ z)
+ #define MAJ(x, y, z) ((z &y) | ((z|y) & x))
+
+void inline STEP8_IF_0(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for(int j=0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[1];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[0];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[3];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[2];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[5];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[4];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[7];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[6];
+#pragma unroll 8
+	for(int j=0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_1(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[6];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[7];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[4];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[5];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[2];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[3];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[0];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[1];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_2(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[2];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[3];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[0];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[1];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[6];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[7];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[4];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[5];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_3(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int  j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[3];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[2];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[1];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[0];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[7];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[6];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[5];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[4];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_4(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int  j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[5];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[4];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[7];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[6];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[1];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[0];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[3];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[2];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_5(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[7];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[6];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[5];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[4];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[3];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[2];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[1];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[0];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_6(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[4];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[5];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[6];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[7];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[0];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[1];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[2];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[3];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_7(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[1];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[0];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[3];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[2];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[5];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[4];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[7];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[6];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_8(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[6];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[7];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[4];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[5];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[2];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[3];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[0];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[1];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_9(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[2];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[3];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[0];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[1];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[6];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[7];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[4];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[5];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_10(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[3];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[2];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[1];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[0];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[7];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[6];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[5];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[4];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_11(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[5];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[4];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[7];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[6];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[1];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[0];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[3];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[2];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_12(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[7];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[6];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[5];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[4];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[3];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[2];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[1];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[0];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_13(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[4];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[5];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[6];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[7];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[0];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[1];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[2];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[3];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_14(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[1];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[0];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[3];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[2];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[5];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[4];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[7];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[6];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_15(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[6];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[7];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[4];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[5];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[2];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[3];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[0];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[1];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_16(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[2];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[3];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[0];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[1];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[6];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[7];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[4];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[5];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_17(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[3];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[2];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[1];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[0];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[7];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[6];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[5];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[4];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_18(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[5];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[4];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[7];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[6];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[1];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[0];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[3];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[2];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_19(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[7];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[6];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[5];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[4];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[3];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[2];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[1];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[0];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_20(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[4];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[5];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[6];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[7];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[0];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[1];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[2];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[3];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_21(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[1];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[0];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[3];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[2];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[5];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[4];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[7];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[6];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_22(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[6];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[7];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[4];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[5];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[2];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[3];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[0];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[1];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_23(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[2];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[3];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[0];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[1];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[6];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[7];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[4];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[5];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_24(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[3];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[2];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[1];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[0];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[7];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[6];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[5];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[4];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_25(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[5];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[4];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[7];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[6];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[1];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[0];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[3];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[2];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_26(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[7];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[6];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[5];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[4];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[3];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[2];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[1];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[0];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_27(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[4];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[5];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[6];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[7];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[0];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[1];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[2];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[3];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_28(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[1];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[0];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[3];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[2];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[5];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[4];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[7];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[6];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_29(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[6];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[7];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[4];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[5];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[2];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[3];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[0];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[1];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_30(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[2];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[3];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[0];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[1];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[6];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[7];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[4];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[5];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_MAJ_31(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[3];
+	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[2];
+	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[1];
+	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[0];
+	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[7];
+	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[6];
+	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[5];
+	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[4];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_32(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[5];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[4];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[7];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[6];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[1];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[0];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[3];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[2];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_33(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[7];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[6];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[5];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[4];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[3];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[2];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[1];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[0];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_34(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[4];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[5];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[6];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[7];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[0];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[1];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[2];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[3];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+void inline STEP8_IF_35(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+{
+	uint temp;
+	uint R[8];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
+	D[0] = ROTL32(temp, s) + R[1];
+	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
+	D[1] = ROTL32(temp, s) + R[0];
+	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
+	D[2] = ROTL32(temp, s) + R[3];
+	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
+	D[3] = ROTL32(temp, s) + R[2];
+	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
+	D[4] = ROTL32(temp, s) + R[5];
+	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
+	D[5] = ROTL32(temp, s) + R[4];
+	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
+	D[6] = ROTL32(temp, s) + R[7];
+	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
+	D[7] = ROTL32(temp, s) + R[6];
+#pragma unroll 8
+	for (int j = 0; j<8; j++) {
+		A[j] = R[j];
+	}
+}
+
+ 
+
+void inline Round8_0_final(uint *A, int r, int s, int t, int u)
+{
+     const uint d_cw0[8][8] = {
+	0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
+	0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
+	0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
+	0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3,
+	0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09,
+	0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE,
+	0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
+	0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
+};
+	STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
+}
+
+void inline Round8_1_final(uint *A, int r, int s, int t, int u)
+{
+     const uint d_cw1[8][8] = {
+	0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
+	0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
+	0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
+	0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318,
+	0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D,
+	0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12,
+	0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
+	0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
+};
+	STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
+}
+
+
+void inline Round8_2_final(uint *A, int r, int s, int t, int u)
+{
+      const uint d_cw2[8][8] = {
+	0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
+	0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
+	0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
+	0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6,
+	0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80,
+	0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E,
+	0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
+	0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
+};
+	STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
+}
+
+ 
+
+void inline Round8_3_final(uint *A, int r, int s, int t, int u)
+{
+     const uint d_cw3[8][8] = {
+	0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
+	0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
+	0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
+	0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6,
+	0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583,
+	0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F,
+	0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
+	0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
+};
+	STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A);
+}
+
+#define expanded_vector(x) (g_fft4[x])
+
+void inline Round8_0(uint *A, const int thr_offset,
+		int r, int s, int t, int u, __global uint4 *g_fft4) {
+	uint w[8];
+    uint4 hv1, hv2;
+
+	int tmp = 0 + thr_offset;
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A);
+
+
+}
+void inline Round8_1(uint *A, const int thr_offset,
+		int r, int s, int t, int u, __global uint4 *g_fft4) {
+	uint w[8];
+    uint4 hv1, hv2;
+
+	int tmp = 16 + thr_offset;
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A);
+
+
+}
+void inline Round8_2(uint *A, const int thr_offset,
+		int r, int s, int t, int u, __global uint4 *g_fft4) {
+	uint w[8];
+    uint4 hv1, hv2;
+
+	int tmp = 32 + thr_offset;
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A);
+
+
+}
+void inline Round8_3(uint *A, const int thr_offset,
+		int r, int s, int t, int u, __global uint4 *g_fft4) {
+	uint w[8];
+    uint4 hv1, hv2;
+
+	int tmp = 48 + thr_offset;
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]);
+    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A);
+
+
+}
+
+void inline SIMD_Compress1(uint *A, const int thr_id, const uint *M, __global uint4 *g_fft4) {
+	int i;
+	const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente)
+#pragma unroll 8
+	for(i=0; i<8; i++) {
+		A[i] ^= M[i];
+		(&A[8])[i] ^= M[8+i];
+	}
+	Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4);
+	Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4);
+}
+
+void Compression1(__global uint *hashval, const int texture_id, __global uint4 *g_fft4, __global uint *g_state) {
+	uint A[32];
+	int i;
+#pragma unroll 32
+	for (i=0; i < 32; i++) A[i] = c_IV_512[i];
+	uint buffer[16];
+#pragma unroll 16
+	for (i=0; i < 16; i++) buffer[i] = hashval[i];
+	SIMD_Compress1(A, texture_id, buffer, g_fft4);
+	__global uint *state = (__global uint*)&g_state[get_group_id(0) * (get_local_size(0)*32)];
+#pragma unroll 32
+	for (i=0; i < 32; i++) state[get_local_id(0)+get_local_size(0)*i] = A[i];
+}
+
+void inline SIMD_Compress2(uint *A, const int thr_id, __global uint4 *g_fft4) {
+	uint IV[4][8];
+	int i;
+	const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente)
+#pragma unroll 8
+	for(i=0; i<8; i++) {
+		IV[0][i] = c_IV_512[i];
+		IV[1][i] = c_IV_512[8+i];
+		IV[2][i] = c_IV_512[16+i];
+		IV[3][i] = c_IV_512[24+i];
+	}
+	Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4);
+	Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4);
+	STEP8_IF_32(IV[0],  4, 13, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
+}
+
+void Compression2(const int texture_id, __global uint4 *g_fft4, __global uint *g_state) {
+	uint A[32];
+	int i;
+	__global uint *state = &g_state[get_group_id(0) * (get_local_size(0)*32)];
+#pragma unroll 32
+	for (i=0; i < 32; i++) A[i] = state[get_local_id(0)+get_local_size(0)*i];
+	SIMD_Compress2(A, texture_id, g_fft4);
+#pragma unroll 32
+	for (i=0; i < 32; i++) state[get_local_id(0)+get_local_size(0)*i] = A[i];
+}
+
+void inline SIMD_Compress_Final(uint *A, const uint *M) {
+	uint IV[4][8];
+	int i;
+#pragma unroll 8
+	for(i=0; i<8; i++) {
+		IV[0][i] = A[i];
+		IV[1][i] = (&A[8])[i];
+		IV[2][i] = (&A[16])[i];
+		IV[3][i] = (&A[24])[i];
+	}
+#pragma unroll 8
+	for(i=0; i<8; i++) {
+		A[i] ^= M[i];
+		(&A[8])[i] ^= M[8+i];
+	}
+	Round8_0_final(A, 3, 23, 17, 27);
+	Round8_1_final(A, 28, 19, 22, 7);
+	Round8_2_final(A, 29, 9, 15, 5);
+	Round8_3_final(A, 4, 13, 10, 25);
+	STEP8_IF_32(IV[0],  4, 13, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
+}
+
+void Final(__global uint *hashval, const int texture_id, __global uint4 *g_fft4, __global uint *g_state) {
+	uint A[32];
+	int i;
+	__global uint *state = &g_state[get_group_id(0) * (get_local_size(0)*32)];
+#pragma unroll 32
+	for (i=0; i < 32; i++) A[i] = state[get_local_id(0)+get_local_size(0)*i];
+	uint buffer[16];
+	buffer[0] = 512;
+#pragma unroll 15
+	for (i=1; i < 16; i++) buffer[i] = 0;
+	SIMD_Compress_Final(A, buffer);
+#pragma unroll 16
+	for (i=0; i < 16; i++)
+		hashval[i] = A[i];
+}
diff --git a/ocl.c b/ocl.c
index 52de5ac..1375088 100644
--- a/ocl.c
+++ b/ocl.c
@@ -1032,6 +1032,11 @@ if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_NAVI) {
     bufsize = (size_t)algorithm->rw_buffer_size;
     applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
   }
+  if (algorithm->type == ALGO_0X10_NAVI) {
+    buf1size = bufsize * 16;
+    buf2size = bufsize * 2;
+    applog(LOG_DEBUG, "0x10 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)bufsize);
+  }
 
   clState->padbuffer8 = NULL;
   clState->buffer1 = NULL;
@@ -1111,6 +1116,19 @@ if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_NAVI) {
         return NULL;
       }
     }
+    else if (algorithm->type == ALGO_0X10_NAVI) {
+      // need additionnal buffers
+      clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
+      if (status != CL_SUCCESS && !clState->buffer1) {
+        applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
+        return NULL;
+      }
+      clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status);
+      if (status != CL_SUCCESS && !clState->buffer1) {
+        applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status);
+        return NULL;
+      }
+    }
     else if (algorithm->type == ALGO_PHI2 || algorithm->type == ALGO_PHI2_NAVI) {
       clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
       if (status != CL_SUCCESS && !clState->buffer1) {

From 69c914a7225f9d6997e26ced81bea1d41755a4d4 Mon Sep 17 00:00:00 2001
From: Changdao Dong <dongchangdao@gmail.com>
Date: Sat, 19 Mar 2022 01:21:04 -0700
Subject: [PATCH 2/4] Issue #266

minor fix
---
 driver-opencl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver-opencl.c b/driver-opencl.c
index 213d27b..d6faea5 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1609,7 +1609,7 @@ if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI
       size_t globalOffset2[1];
       globalThreads2[0] = globalThreads[0] * 8;
       localThreads2[0] = 128;
-      globalOffset2[0] = (*p_global_work_offset) * 4;
+      globalOffset2[0] = (*p_global_work_offset) * 8;
       status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
         globalThreads2, localThreads2, 0, NULL, NULL);
     } else if (gpu->algorithm.type == ALGO_0X10_NAVI && (i == 8 || i == 9)) {
@@ -1618,7 +1618,7 @@ if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI
       size_t globalOffset2[1];
       globalThreads2[0] = globalThreads[0];
       localThreads2[0] = 128;
-      globalOffset2[0] = (*p_global_work_offset) * 4;
+      globalOffset2[0] = (*p_global_work_offset);
       status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
         globalThreads2, localThreads2, 0, NULL, NULL);
     } else if (gpu->algorithm.type == ALGO_0X10 && i== 2 && clState->prebuilt) {

From 7730caf1d3865a9aea070cc8f85cb6e3149ff72c Mon Sep 17 00:00:00 2001
From: Changdao Dong <dongchangdao@gmail.com>
Date: Sun, 20 Mar 2022 20:08:29 -0700
Subject: [PATCH 3/4] Issue #266

simplifiy simd phase 2
---
 algorithm.c            |    7 +-
 driver-opencl.c        |   11 +-
 kernel/chainox_navi.cl |   24 +-
 kernel/simd_navi.cl    | 1563 +++++++---------------------------------
 ocl.c                  |    6 -
 5 files changed, 274 insertions(+), 1337 deletions(-)

diff --git a/algorithm.c b/algorithm.c
index 91fc888..2c29982 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -819,11 +819,6 @@ static cl_int queue_chainox_navi_kernel(struct __clState *clState, struct _dev_b
   // simd - search9
   CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
   CL_SET_ARG_N(1, clState->buffer1);
-  CL_SET_ARG_N(2, clState->buffer2);
-  // simd - search10
-  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
-  CL_SET_ARG_N(1, clState->buffer1);
-  CL_SET_ARG_N(2, clState->buffer2);
   if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search8 arg failed.", status);
   // shavite - search11
   CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
@@ -2687,7 +2682,7 @@ static algorithm_settings_t algos[] = {
 
   { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, NULL, NULL, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
   { "chainox", ALGO_0X10, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_kernel, gen_hash, append_x11_compiler_options },
-  { "chainox_navi", ALGO_0X10_NAVI, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_navi_kernel, gen_hash, append_x11_compiler_options },
+  { "chainox_navi", ALGO_0X10_NAVI, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 11, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_navi_kernel, gen_hash, append_x11_compiler_options },
 
   { "sibcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 11, 2 * 16 * 4194304, 0, sibcoin_regenhash, NULL, NULL, queue_sibcoin_mod_kernel, gen_hash, append_x11_compiler_options },
   
diff --git a/driver-opencl.c b/driver-opencl.c
index d6faea5..3afd7cc 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1603,7 +1603,7 @@ if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI
       globalOffset2[0] = (*p_global_work_offset) * 4;
       status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
         globalThreads2, localThreads2, 0, NULL, NULL);
-    } else if (gpu->algorithm.type == ALGO_0X10_NAVI && i == 7) {
+    } else if (gpu->algorithm.type == ALGO_0X10_NAVI && (i == 7 || i == 8)) {
       size_t globalThreads2[1];
       size_t localThreads2[1];
       size_t globalOffset2[1];
@@ -1612,15 +1612,6 @@ if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI
       globalOffset2[0] = (*p_global_work_offset) * 8;
       status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
         globalThreads2, localThreads2, 0, NULL, NULL);
-    } else if (gpu->algorithm.type == ALGO_0X10_NAVI && (i == 8 || i == 9)) {
-      size_t globalThreads2[1];
-      size_t localThreads2[1];
-      size_t globalOffset2[1];
-      globalThreads2[0] = globalThreads[0];
-      localThreads2[0] = 128;
-      globalOffset2[0] = (*p_global_work_offset);
-      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
-        globalThreads2, localThreads2, 0, NULL, NULL);
     } else if (gpu->algorithm.type == ALGO_0X10 && i== 2 && clState->prebuilt) {
        size_t globalThreads2[1];
       size_t localThreads2[1];
diff --git a/kernel/chainox_navi.cl b/kernel/chainox_navi.cl
index 54f4b18..d124c2f 100644
--- a/kernel/chainox_navi.cl
+++ b/kernel/chainox_navi.cl
@@ -804,28 +804,18 @@ __kernel void search8(__global uint* g_hash, __global uint4 *g_temp4)
 }
 
 __attribute__((reqd_work_group_size(128, 1, 1)))
-__kernel void search9(__global uint* g_hash, __global uint4 *g_fft4, __global uint *g_state)
+__kernel void search9(__global uint* g_hash, __global uint4 *g_fft4)
 {
   uint gid = get_global_id(0);
-  uint thread = gid - get_global_offset(0);
-
-  __global uint *Hash = &g_hash[thread * 16];
-  Compression1(Hash, thread, g_fft4, g_state);
-  Compression2(thread, g_fft4, g_state);
-}
-
-__attribute__((reqd_work_group_size(128, 1, 1)))
-__kernel void search10(__global uint* g_hash, __global uint4 *g_fft4, __global uint *g_state)
-{
-  uint gid = get_global_id(0);
-  uint thread = gid - get_global_offset(0);
+  int threadBloc = (gid-get_global_offset(0)) / 8;
+  int hashPosition = threadBloc * 16;
+  __global uint *Hash = &g_hash[hashPosition];
 
-  __global uint *Hash = &g_hash[thread * 16];
-  Final(Hash, thread, g_fft4, g_state);
+x11_simd512_gpu_compress_64(Hash, g_fft4);
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search11(__global hash_t* hashes)
+__kernel void search10(__global hash_t* hashes)
 {
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -907,7 +897,7 @@ __kernel void search11(__global hash_t* hashes)
 
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search12(__global hash_t* hashes, __global uint* output, const ulong target)
+__kernel void search11(__global hash_t* hashes, __global uint* output, const ulong target)
 {
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
diff --git a/kernel/simd_navi.cl b/kernel/simd_navi.cl
index 78c97f7..3cf48dd 100644
--- a/kernel/simd_navi.cl
+++ b/kernel/simd_navi.cl
@@ -696,1376 +696,343 @@ void inline FFT_256_halfzero(int *y)
 
 // ======================================================================================================
 
- #define IF(x, y, z) (((y ^ z) & x) ^ z)
- #define MAJ(x, y, z) ((z &y) | ((z|y) & x))
-
-void inline STEP8_IF_0(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for(int j=0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[1];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[0];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[3];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[2];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[5];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[4];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[7];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[6];
-#pragma unroll 8
-	for(int j=0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_1(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[6];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[7];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[4];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[5];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[2];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[3];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[0];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[1];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_2(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[2];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[3];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[0];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[1];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[6];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[7];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[4];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[5];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_3(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int  j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[3];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[2];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[1];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[0];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[7];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[6];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[5];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[4];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_4(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int  j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[5];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[4];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[7];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[6];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[1];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[0];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[3];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[2];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_5(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[7];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[6];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[5];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[4];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[3];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[2];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[1];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[0];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_6(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[4];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[5];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[6];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[7];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[0];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[1];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[2];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[3];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_7(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[1];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[0];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[3];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[2];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[5];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[4];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[7];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[6];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_8(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[6];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[7];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[4];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[5];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[2];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[3];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[0];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[1];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_9(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+static inline uint IF(uint x, uint y, uint z)
 {
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[2];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[3];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[0];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[1];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[6];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[7];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[4];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[5];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
+	return (((y ^ z) & x) ^ z);
 }
-void inline STEP8_IF_10(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
 
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[3];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[2];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[1];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[0];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[7];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[6];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[5];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[4];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_11(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[5];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[4];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[7];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[6];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[1];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[0];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[3];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[2];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_12(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[7];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[6];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[5];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[4];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[3];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[2];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[1];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[0];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_13(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[4];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[5];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[6];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[7];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[0];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[1];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[2];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[3];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_14(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[1];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[0];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[3];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[2];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[5];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[4];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[7];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[6];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_15(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[6];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[7];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[4];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[5];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[2];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[3];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[0];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[1];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_16(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[2];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[3];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[0];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[1];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[6];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[7];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[4];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[5];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_17(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[3];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[2];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[1];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[0];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[7];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[6];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[5];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[4];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_18(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[5];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[4];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[7];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[6];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[1];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[0];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[3];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[2];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_19(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[7];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[6];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[5];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[4];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[3];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[2];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[1];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[0];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
+static inline uint MAJ(const uint x, const uint y, const uint z){
+	return ((z &y) | ((z | y) & x));
 }
-void inline STEP8_MAJ_20(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[4];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[5];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[6];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[7];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[0];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[1];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[2];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[3];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_21(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[1];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[0];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[3];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[2];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[5];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[4];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[7];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[6];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_22(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[6];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[7];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[4];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[5];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[2];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[3];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[0];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[1];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_23(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[2];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[3];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[0];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[1];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[6];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[7];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[4];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[5];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_24(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[3];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[2];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[1];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[0];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[7];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[6];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[5];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[4];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_25(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[5];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[4];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[7];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[6];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[1];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[0];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[3];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[2];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_26(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[7];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[6];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[5];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[4];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[3];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[2];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[1];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[0];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_27(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+
+#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \
+	((x)%7) == 1 ? 6 : \
+	((x)%7) == 2 ? 2 : \
+	((x)%7) == 3 ? 3 : \
+	((x)%7) == 4 ? 5 : \
+	((x)%7) == 5 ? 7 : 4 )
+
+static inline void STEP8_IF_G(__constant uint * w, const uint i, const uint r, const uint s, uint *const A, const uint *const B, const uint *const C, uint *const D)
 {
-	uint temp;
 	uint R[8];
+
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[4];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[5];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[6];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[7];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[0];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[1];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[2];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[3];
+
+	uint W[8];
+	*(uint8*)&W[0] = *(__constant uint8*)&w[0];
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_28(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
+	for (int j = 0; j<8; j++)
+		D[j] += W[j] + IF(A[j], B[j], C[j]);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[1];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[0];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[3];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[2];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[5];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[4];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[7];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[6];
+	for (int j = 0; j<8; j++)
+		D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		A[j] = R[j];
-	}
 }
-void inline STEP8_MAJ_29(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+
+static void inline STEP8_MAJ_G(__constant uint * w, const uint i, const uint r, const uint s, uint *const A, const uint *const B, const uint *const C, uint *const D)
 {
-	uint temp;
 	uint R[8];
+
+	uint W[8];
+	*(uint8*)&W[0] = *(__constant uint8*)&w[0];
+
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[6];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[7];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[4];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[5];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[2];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[3];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[0];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[1];
+
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_MAJ_30(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
+	for (int j = 0; j<8; j++)
+		D[j] += W[j] + MAJ(A[j], B[j], C[j]);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[2];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[3];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[0];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[1];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[6];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[7];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[4];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[5];
+	for (int j = 0; j<8; j++)
+		D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		A[j] = R[j];
-	}
 }
-void inline STEP8_MAJ_31(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+
+static inline void STEP8_IF(const uint *const w, const uint i, const uint r, const uint s, uint *const A, const uint *const B, const uint *const C, uint *const D)
 {
-	uint temp;
 	uint R[8];
+
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[3];
-	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[2];
-	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[1];
-	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[0];
-	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[7];
-	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[6];
-	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[5];
-	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[4];
+
+	uint W[8];
+	*(uint8*)&W[0] = *(uint8*)&w[0];
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_32(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
+	for (int j = 0; j<8; j++)
+		D[j] += W[j] + IF(A[j], B[j], C[j]);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[5];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[4];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[7];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[6];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[1];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[0];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[3];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[2];
+	for (int j = 0; j<8; j++)
+		D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		A[j] = R[j];
-	}
 }
-void inline STEP8_IF_33(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
+
+static inline void STEP8_MAJ(const uint *const w, const uint i, const uint r, const uint s, uint *const A, const uint *const B, const uint *const C, uint *const D)
 {
-	uint temp;
 	uint R[8];
+
+	uint W[8];
+	*(uint8*)&W[0] = *(uint8*)&w[0];
+
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[7];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[6];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[5];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[4];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[3];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[2];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[1];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[0];
+
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_34(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
+	for (int j = 0; j<8; j++)
+		D[j] += W[j] + MAJ(A[j], B[j], C[j]);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[4];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[5];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[6];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[7];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[0];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[1];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[2];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[3];
+	for (int j = 0; j<8; j++)
+		D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s);
 #pragma unroll 8
-	for (int j = 0; j<8; j++) {
+	for (int j = 0; j<8; j++)
 		A[j] = R[j];
-	}
-}
-void inline STEP8_IF_35(const uint *w, const int r, const int s, uint * A, const uint * B, const uint * C, uint * D)
-{
-	uint temp;
-	uint R[8];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		R[j] = ROTL32(A[j], r);
-	}
-	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
-	D[0] = ROTL32(temp, s) + R[1];
-	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
-	D[1] = ROTL32(temp, s) + R[0];
-	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
-	D[2] = ROTL32(temp, s) + R[3];
-	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
-	D[3] = ROTL32(temp, s) + R[2];
-	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
-	D[4] = ROTL32(temp, s) + R[5];
-	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
-	D[5] = ROTL32(temp, s) + R[4];
-	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
-	D[6] = ROTL32(temp, s) + R[7];
-	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
-	D[7] = ROTL32(temp, s) + R[6];
-#pragma unroll 8
-	for (int j = 0; j<8; j++) {
-		A[j] = R[j];
-	}
 }
 
- 
+__constant uint d_cw[4][8][8] = {
+	0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
+	0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3,
+	0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE,
+	0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3,
+	0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
+	0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318,
+	0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12,
+	0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80,
+	0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
+	0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6,
+	0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E,
+	0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE,
+	0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
+	0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6,
+	0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F,
+	0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
 
-void inline Round8_0_final(uint *A, int r, int s, int t, int u)
-{
-     const uint d_cw0[8][8] = {
-	0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
-	0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
-	0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
-	0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3,
-	0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09,
-	0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE,
-	0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
-	0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
 };
-	STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A);
-	STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
+
+static inline void Round8_0_final(uint *const  A, const uint r, const  uint s, const uint t, const uint u){
+
+	STEP8_IF_G(d_cw[0][0], 0, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_G(d_cw[0][1], 1, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_G(d_cw[0][2], 2, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_G(d_cw[0][3], 3, u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_G(d_cw[0][4], 4, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_G(d_cw[0][5], 5, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_G(d_cw[0][6], 6, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_G(d_cw[0][7], 7, u, r, &A[8], &A[16], &A[24], A);
 }
 
-void inline Round8_1_final(uint *A, int r, int s, int t, int u)
-{
-     const uint d_cw1[8][8] = {
-	0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
-	0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
-	0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
-	0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318,
-	0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D,
-	0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12,
-	0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
-	0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
-};
-	STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A);
-	STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
+static inline void Round8_1_final(uint *const  A, const uint r, const  uint s, const uint t, const uint u){
+
+	STEP8_IF_G(d_cw[1][0], 8, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_G(d_cw[1][1], 9, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_G(d_cw[1][2], 10, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_G(d_cw[1][3], 11, u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_G(d_cw[1][4], 12, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_G(d_cw[1][5], 13, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_G(d_cw[1][6], 14, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_G(d_cw[1][7], 15, u, r, &A[8], &A[16], &A[24], A);
 }
 
+static inline void Round8_2_final(uint *const  A, const uint r, const  uint s, const uint t, const uint u){
 
-void inline Round8_2_final(uint *A, int r, int s, int t, int u)
-{
-      const uint d_cw2[8][8] = {
-	0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
-	0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
-	0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
-	0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6,
-	0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80,
-	0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E,
-	0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
-	0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
-};
-	STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A);
-	STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
+	STEP8_IF_G(d_cw[2][0], 16, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_G(d_cw[2][1], 17, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_G(d_cw[2][2], 18, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_G(d_cw[2][3], 19, u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_G(d_cw[2][4], 20, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_G(d_cw[2][5], 21, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_G(d_cw[2][6], 22, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_G(d_cw[2][7], 23, u, r, &A[8], &A[16], &A[24], A);
 }
 
- 
+static inline void Round8_3_final(uint*const  A, const uint r, const  uint s, const uint t, const uint u){
 
-void inline Round8_3_final(uint *A, int r, int s, int t, int u)
-{
-     const uint d_cw3[8][8] = {
-	0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
-	0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
-	0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
-	0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6,
-	0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583,
-	0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F,
-	0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
-	0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
-};
-	STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A);
-	STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]);
-	STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]);
-	STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]);
-	STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A);
+	STEP8_IF_G(d_cw[3][0], 24, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF_G(d_cw[3][1], 25, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF_G(d_cw[3][2], 26, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF_G(d_cw[3][3], 27, u, r, &A[8], &A[16], &A[24], A);
+	STEP8_MAJ_G(d_cw[3][4], 28, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ_G(d_cw[3][5], 29, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ_G(d_cw[3][6], 30, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ_G(d_cw[3][7], 31, u, r, &A[8], &A[16], &A[24], A);
 }
 
-#define expanded_vector(x) (g_fft4[x])
+#define expanded_vector(x) g_fft4[x]
+//static __device__ __forceinline__ void expanded_vector(uint* w, const uint4* ptr){
+////	asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(w[0]), "=r"(w[1]), "=r"(w[2]), "=r"(w[3]) : __LDG_PTR(ptr));
+//}
+
+static inline void Round8(uint*const  A, const uint thr_offset, __global uint4 *const g_fft4) {
 
-void inline Round8_0(uint *A, const int thr_offset,
-		int r, int s, int t, int u, __global uint4 *g_fft4) {
 	uint w[8];
-    uint4 hv1, hv2;
+	uint tmp = thr_offset;
+	uint4 hv1, hv2;
+
+	uint r = 3, s = 23, t = 17, u = 27;
 
-	int tmp = 0 + thr_offset;
 	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
 	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF(w, 0, r, s, A, &A[8], &A[16], &A[24]);
 	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
 	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF(w, 1, s, t, &A[24], A, &A[8], &A[16]);
 	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
 	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF(w, 2, t, u, &A[16], &A[24], A, &A[8]);
 	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
 	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A);
-
-
-}
-void inline Round8_1(uint *A, const int thr_offset,
-		int r, int s, int t, int u, __global uint4 *g_fft4) {
-	uint w[8];
-    uint4 hv1, hv2;
-
-	int tmp = 16 + thr_offset;
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A);
-
-
-}
-void inline Round8_2(uint *A, const int thr_offset,
-		int r, int s, int t, int u, __global uint4 *g_fft4) {
-	uint w[8];
-    uint4 hv1, hv2;
-
-	int tmp = 32 + thr_offset;
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A);
-
+	STEP8_IF(w, 3, u, r, &A[8], &A[16], &A[24], A);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 4, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 5, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 6, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 7, u, r, &A[8], &A[16], &A[24], A);
 
-}
-void inline Round8_3(uint *A, const int thr_offset,
-		int r, int s, int t, int u, __global uint4 *g_fft4) {
-	uint w[8];
-    uint4 hv1, hv2;
-
-	int tmp = 48 + thr_offset;
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]);
-    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
-    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
-	STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A);
+	r = 28; s = 19; t = 22; u = 7;
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 8, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 9, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 10, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 11, u, r, &A[8], &A[16], &A[24], A);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 12, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 13, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 14, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 15, u, r, &A[8], &A[16], &A[24], A);
 
+	r = 29; s = 9; t = 15; u = 5;
 
-}
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 16, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 17, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 18, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 19, u, r, &A[8], &A[16], &A[24], A);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 20, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 21, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 22, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 23, u, r, &A[8], &A[16], &A[24], A);
 
-void inline SIMD_Compress1(uint *A, const int thr_id, const uint *M, __global uint4 *g_fft4) {
-	int i;
-	const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente)
-#pragma unroll 8
-	for(i=0; i<8; i++) {
-		A[i] ^= M[i];
-		(&A[8])[i] ^= M[8+i];
-	}
-	Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4);
-	Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4);
-}
+	r = 4; s = 13; t = 10; u = 25;
 
-void Compression1(__global uint *hashval, const int texture_id, __global uint4 *g_fft4, __global uint *g_state) {
-	uint A[32];
-	int i;
-#pragma unroll 32
-	for (i=0; i < 32; i++) A[i] = c_IV_512[i];
-	uint buffer[16];
-#pragma unroll 16
-	for (i=0; i < 16; i++) buffer[i] = hashval[i];
-	SIMD_Compress1(A, texture_id, buffer, g_fft4);
-	__global uint *state = (__global uint*)&g_state[get_group_id(0) * (get_local_size(0)*32)];
-#pragma unroll 32
-	for (i=0; i < 32; i++) state[get_local_id(0)+get_local_size(0)*i] = A[i];
-}
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 24, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 25, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 26, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_IF(w, 27, u, r, &A[8], &A[16], &A[24], A);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 28, r, s, A, &A[8], &A[16], &A[24]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 29, s, t, &A[24], A, &A[8], &A[16]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 30, t, u, &A[16], &A[24], A, &A[8]);
+	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
+	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
+	STEP8_MAJ(w, 31, u, r, &A[8], &A[16], &A[24], A);
 
-void inline SIMD_Compress2(uint *A, const int thr_id, __global uint4 *g_fft4) {
-	uint IV[4][8];
-	int i;
-	const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente)
-#pragma unroll 8
-	for(i=0; i<8; i++) {
-		IV[0][i] = c_IV_512[i];
-		IV[1][i] = c_IV_512[8+i];
-		IV[2][i] = c_IV_512[16+i];
-		IV[3][i] = c_IV_512[24+i];
-	}
-	Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4);
-	Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4);
-	STEP8_IF_32(IV[0],  4, 13, A, &A[8], &A[16], &A[24]);
-	STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]);
-	STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]);
-	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
 }
 
-void Compression2(const int texture_id, __global uint4 *g_fft4, __global uint *g_state) {
+void x11_simd512_gpu_compress_64(__global uint *Hash, __global uint4 * g_fft4)
+{
+uint gid = get_global_id(0);
+  uint threadBloc = (gid-get_global_offset(0)) >> 3;
+  uint thr_offset = threadBloc << 6; // thr_id * 128 (je zwei elemente)
+    uint IV[32];
 	uint A[32];
-	int i;
-	__global uint *state = &g_state[get_group_id(0) * (get_local_size(0)*32)];
-#pragma unroll 32
-	for (i=0; i < 32; i++) A[i] = state[get_local_id(0)+get_local_size(0)*i];
-	SIMD_Compress2(A, texture_id, g_fft4);
-#pragma unroll 32
-	for (i=0; i < 32; i++) state[get_local_id(0)+get_local_size(0)*i] = A[i];
-}
-
-void inline SIMD_Compress_Final(uint *A, const uint *M) {
-	uint IV[4][8];
-	int i;
-#pragma unroll 8
-	for(i=0; i<8; i++) {
-		IV[0][i] = A[i];
-		IV[1][i] = (&A[8])[i];
-		IV[2][i] = (&A[16])[i];
-		IV[3][i] = (&A[24])[i];
-	}
-#pragma unroll 8
-	for(i=0; i<8; i++) {
-		A[i] ^= M[i];
-		(&A[8])[i] ^= M[8+i];
-	}
-	Round8_0_final(A, 3, 23, 17, 27);
-	Round8_1_final(A, 28, 19, 22, 7);
-	Round8_2_final(A, 29, 9, 15, 5);
-	Round8_3_final(A, 4, 13, 10, 25);
-	STEP8_IF_32(IV[0],  4, 13, A, &A[8], &A[16], &A[24]);
-	STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]);
-	STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]);
-	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
-}
 
-void Final(__global uint *hashval, const int texture_id, __global uint4 *g_fft4, __global uint *g_state) {
-	uint A[32];
-	int i;
-	__global uint *state = &g_state[get_group_id(0) * (get_local_size(0)*32)];
-#pragma unroll 32
-	for (i=0; i < 32; i++) A[i] = state[get_local_id(0)+get_local_size(0)*i];
-	uint buffer[16];
-	buffer[0] = 512;
-#pragma unroll 15
-	for (i=1; i < 16; i++) buffer[i] = 0;
-	SIMD_Compress_Final(A, buffer);
-#pragma unroll 16
-	for (i=0; i < 16; i++)
-		hashval[i] = A[i];
+	*(uint8 *)&IV[ 0] = *(__constant uint8 *)&c_IV_512[ 0];
+	*(uint8 *)&IV[ 8] = *(__constant uint8 *)&c_IV_512[ 8];
+	*(uint8 *)&IV[16] = *(__constant uint8 *)&c_IV_512[16];
+	*(uint8 *)&IV[24] = *(__constant uint8 *)&c_IV_512[24];
+
+	*(uint8 *)&A[ 0] = ((__global uint8 *)Hash)[ 0];
+	*(uint8 *)&A[ 8] = ((__global uint8 *)Hash)[ 1];
+
+	#pragma unroll 16
+	for(uint i=0;i<16;i++)
+		A[ i] = A[ i] ^ IV[ i];
+
+	#pragma unroll 16
+	for(uint i=16;i<32;i++)
+		A[ i] = IV[ i];
+
+	Round8(A, thr_offset, g_fft4);
+	
+	STEP8_IF(&IV[ 0],32, 4,13,&A[ 0],&A[ 8],&A[16],&A[24]);
+	STEP8_IF(&IV[ 8],33,13,10,&A[24],&A[ 0],&A[ 8],&A[16]);
+	STEP8_IF(&IV[16],34,10,25,&A[16],&A[24],&A[ 0],&A[ 8]);
+	STEP8_IF(&IV[24],35,25, 4,&A[ 8],&A[16],&A[24],&A[ 0]);
+
+	#pragma unroll 32
+	for(uint i=0;i<32;i++){
+		IV[ i] = A[ i];
+	}
+	
+	A[ 0] ^= 512;
+
+	Round8_0_final(A, 3,23,17,27);
+	Round8_1_final(A,28,19,22, 7);
+	Round8_2_final(A,29, 9,15, 5);
+	Round8_3_final(A, 4,13,10,25);
+	STEP8_IF(&IV[ 0],32, 4,13, &A[ 0], &A[ 8], &A[16], &A[24]);
+	STEP8_IF(&IV[ 8],33,13,10, &A[24], &A[ 0], &A[ 8], &A[16]);
+	STEP8_IF(&IV[16],34,10,25, &A[16], &A[24], &A[ 0], &A[ 8]);
+	STEP8_IF(&IV[24],35,25, 4, &A[ 8], &A[16], &A[24], &A[ 0]);
+
+	*(__global uint8 *)&Hash[ 0] = *(uint8 *)&A[ 0];
+	*(__global uint8 *)&Hash[ 8] = *(uint8 *)&A[ 8];
 }
diff --git a/ocl.c b/ocl.c
index 1375088..ceeec97 100644
--- a/ocl.c
+++ b/ocl.c
@@ -1034,7 +1034,6 @@ if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_NAVI) {
   }
   if (algorithm->type == ALGO_0X10_NAVI) {
     buf1size = bufsize * 16;
-    buf2size = bufsize * 2;
     applog(LOG_DEBUG, "0x10 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)bufsize);
   }
 
@@ -1123,11 +1122,6 @@ if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_NAVI) {
         applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
         return NULL;
       }
-      clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status);
-      if (status != CL_SUCCESS && !clState->buffer1) {
-        applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status);
-        return NULL;
-      }
     }
     else if (algorithm->type == ALGO_PHI2 || algorithm->type == ALGO_PHI2_NAVI) {
       clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);

From 91e41a44294266b9ae2b7bfec1f21a298a54c3cb Mon Sep 17 00:00:00 2001
From: Changdao Dong <dongchangdao@gmail.com>
Date: Mon, 21 Mar 2022 22:37:54 -0700
Subject: [PATCH 4/4] Issue #266

Faster simd on navi cards
---
 algorithm.c            | 10 +++-------
 driver-opencl.c        |  4 ++--
 kernel/chainox_navi.cl | 27 ++++++++-------------------
 kernel/simd_navi.cl    | 27 ++++++++++++++-------------
 ocl.c                  | 12 ------------
 5 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/algorithm.c b/algorithm.c
index 2c29982..c52ec5f 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -815,15 +815,11 @@ static cl_int queue_chainox_navi_kernel(struct __clState *clState, struct _dev_b
   if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search7 arg failed.", status);
   // simd - search8
   CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
-  CL_SET_ARG_N(1, clState->buffer1);
-  // simd - search9
-  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
-  CL_SET_ARG_N(1, clState->buffer1);
   if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search8 arg failed.", status);
-  // shavite - search11
+  // shavite - search9
   CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
   if (status != CL_SUCCESS) applog(LOG_ERR, "Error %d: search9 arg failed.", status);
-  // echo - search12
+  // echo - search10
   num = 0;
   CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
   CL_SET_ARG(clState->outputBuffer);
@@ -2682,7 +2678,7 @@ static algorithm_settings_t algos[] = {
 
   { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, NULL, NULL, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
   { "chainox", ALGO_0X10, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_kernel, gen_hash, append_x11_compiler_options },
-  { "chainox_navi", ALGO_0X10_NAVI, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 11, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_navi_kernel, gen_hash, append_x11_compiler_options },
+  { "chainox_navi", ALGO_0X10_NAVI, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, chainox_regenhash, NULL, NULL, queue_chainox_navi_kernel, gen_hash, append_x11_compiler_options },
 
   { "sibcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 11, 2 * 16 * 4194304, 0, sibcoin_regenhash, NULL, NULL, queue_sibcoin_mod_kernel, gen_hash, append_x11_compiler_options },
   
diff --git a/driver-opencl.c b/driver-opencl.c
index 3afd7cc..aff4786 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1603,12 +1603,12 @@ if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI
       globalOffset2[0] = (*p_global_work_offset) * 4;
       status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
         globalThreads2, localThreads2, 0, NULL, NULL);
-    } else if (gpu->algorithm.type == ALGO_0X10_NAVI && (i == 7 || i == 8)) {
+    } else if (gpu->algorithm.type == ALGO_0X10_NAVI && (i == 7)) {
       size_t globalThreads2[1];
       size_t localThreads2[1];
       size_t globalOffset2[1];
       globalThreads2[0] = globalThreads[0] * 8;
-      localThreads2[0] = 128;
+      localThreads2[0] = 32;
       globalOffset2[0] = (*p_global_work_offset) * 8;
       status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, globalOffset2,
         globalThreads2, localThreads2, 0, NULL, NULL);
diff --git a/kernel/chainox_navi.cl b/kernel/chainox_navi.cl
index d124c2f..2bf0927 100644
--- a/kernel/chainox_navi.cl
+++ b/kernel/chainox_navi.cl
@@ -783,9 +783,11 @@ __kernel void search7(__global hash_t* hashes)
   barrier(CLK_GLOBAL_MEM_FENCE);
 }
 
-__attribute__((reqd_work_group_size(128, 1, 1)))
-__kernel void search8(__global uint* g_hash, __global uint4 *g_temp4)
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void search8(__global uint* g_hash)
 {
+  __local uint4 g_temp4[4 * 64];
+
   uint gid = get_global_id(0);
   int threadBloc = (gid-get_global_offset(0)) / 8;
   int hashPosition = threadBloc * 16;
@@ -797,25 +799,12 @@ __kernel void search8(__global uint* g_hash, __global uint4 *g_temp4)
   Hash[0] = inpHash[ndx];
   Hash[1] = inpHash[ndx + 8];
 
-  // Puffer für expandierte Nachricht
-  __global uint4 *temp4 = &g_temp4[hashPosition * 4];
-
-  Expansion(Hash, temp4);
-}
-
-__attribute__((reqd_work_group_size(128, 1, 1)))
-__kernel void search9(__global uint* g_hash, __global uint4 *g_fft4)
-{
-  uint gid = get_global_id(0);
-  int threadBloc = (gid-get_global_offset(0)) / 8;
-  int hashPosition = threadBloc * 16;
-  __global uint *Hash = &g_hash[hashPosition];
-
-x11_simd512_gpu_compress_64(Hash, g_fft4);
+  Expansion(Hash, g_temp4);
+  x11_simd512_gpu_compress_64(inpHash, g_temp4);
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search10(__global hash_t* hashes)
+__kernel void search9(__global hash_t* hashes)
 {
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@@ -897,7 +886,7 @@ __kernel void search10(__global hash_t* hashes)
 
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search11(__global hash_t* hashes, __global uint* output, const ulong target)
+__kernel void search10(__global hash_t* hashes, __global uint* output, const ulong target)
 {
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
diff --git a/kernel/simd_navi.cl b/kernel/simd_navi.cl
index 3cf48dd..5fda9f1 100644
--- a/kernel/simd_navi.cl
+++ b/kernel/simd_navi.cl
@@ -469,8 +469,9 @@ void inline FFT_256_halfzero(int *y)
     FFT_128_full(y+16);
 }
 
- void Expansion(uint *data, __global uint4 *g_temp4)
+ void Expansion(uint *data, __local uint4 *g_temp4)
  {
+     uint idx = get_local_id(0) >> 3;
      uint plo, phi;
      /* Message Expansion using Number Theoretical Transform similar to FFT */
      int expanded[32];
@@ -517,7 +518,7 @@ void inline FFT_256_halfzero(int *y)
      p1 = expanded[12]; __shfl_right1(p2, expanded[14]); p = even ? p1 : p2;
      q1 = expanded[28]; __shfl_right1(q2, expanded[30]); q = even ? q1 : q2;
      __shfl_cperm0(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
-     g_temp4[get_local_id(0)&7] = vec0;
+     g_temp4[(get_local_id(0)&7) * 4 + idx] = vec0;
  
  //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         6 6 6 6 6 6 6 6     6 6 6 6 6 6 6 6
  //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         2 2 2 2 2 2 2 2     2 2 2 2 2 2 2 2
@@ -538,7 +539,7 @@ void inline FFT_256_halfzero(int *y)
      p1 = expanded[13]; __shfl_right1(p2, expanded[15]); p = even ? p1 : p2;
      q1 = expanded[29]; __shfl_right1(q2, expanded[31]); q = even ? q1 : q2;
      __shfl_cperm1(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
-     g_temp4[8+(get_local_id(0)&7)] = vec0;
+     g_temp4[(8+(get_local_id(0)&7)) * 4 + idx] = vec0;
  
  //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         7 7 7 7 7 7 7 7     7 7 7 7 7 7 7 7
  //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         5 5 5 5 5 5 5 5     5 5 5 5 5 5 5 5
@@ -561,7 +562,7 @@ void inline FFT_256_halfzero(int *y)
      p1 = hi?expanded[13]:expanded[12]; __shfl_left1(p2, hi?expanded[15]:expanded[14]); p = !even ? p1 : p2;
      q1 = hi?expanded[29]:expanded[28]; __shfl_left1(q2, hi?expanded[31]:expanded[30]); q = !even ? q1 : q2;
      __shfl_cperm2(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q), 0x05040100));
-     g_temp4[16+(get_local_id(0)&7)] = vec0;
+     g_temp4[(16+(get_local_id(0)&7)) * 4 + idx] = vec0;
  
  //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         1 1 1 1 1 1 1 1     1 1 1 1 1 1 1 1
  //  1   9   5  13   3  11   7  15      17  25  21  29  19  27  23  31         3 3 3 3 3 3 3 3     3 3 3 3 3 3 3 3
@@ -584,7 +585,7 @@ void inline FFT_256_halfzero(int *y)
      p1 = lo?expanded[13]:expanded[12]; __shfl_left1(p2, lo?expanded[15]:expanded[14]); p = !even ? p1 : p2;
      q1 = lo?expanded[29]:expanded[28]; __shfl_left1(q2, lo?expanded[31]:expanded[30]); q = !even ? q1 : q2;
      __shfl_cperm3(vec0.w, (int)__byte_perm(mul_185(p),  mul_185(q) , 0x05040100));
-     g_temp4[24+(get_local_id(0)&7)] = vec0;
+     g_temp4[(24+(get_local_id(0)&7)) * 4 + idx] = vec0;
  
  //  1   9   5  13   3  11   7  15       1   9   5  13   3  11   7  15         0 0 0 0 0 0 0 0     1 1 1 1 1 1 1 1
  //  0   8   4  12   2  10   6  14       0   8   4  12   2  10   6  14         4 4 4 4 4 4 4 4     5 5 5 5 5 5 5 5
@@ -614,7 +615,7 @@ void inline FFT_256_halfzero(int *y)
      q2 = sel?expanded[14]:expanded[15]; __shfl_xor1(p2, q2);
      p = even? p1 : p2; q = even? q1 : q2;
      __shfl_cperm4(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
-     g_temp4[32+(get_local_id(0)&7)] = vec0;
+     g_temp4[(32+(get_local_id(0)&7)) * 4 + idx] = vec0;
  
  //  0   8   4  12   2  10   6  14       0   8   4  12   2  10   6  14         6 6 6 6 6 6 6 6     7 7 7 7 7 7 7 7
  //  1   9   5  13   3  11   7  15       1   9   5  13   3  11   7  15         2 2 2 2 2 2 2 2     3 3 3 3 3 3 3 3
@@ -637,7 +638,7 @@ void inline FFT_256_halfzero(int *y)
      q2 = sel?expanded[15]:expanded[14]; __shfl_xor1(p2, q2);
      p = even? p1 : p2; q = even? q1 : q2;
      __shfl_cperm5(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
-     g_temp4[40+(get_local_id(0)&7)] = vec0;
+     g_temp4[(40+(get_local_id(0)&7)) * 4 + idx] = vec0;
  
  // 16  24  20  28  18  26  22  30      16  24  20  28  18  26  22  30         6 6 6 6 6 6 6 6     7 7 7 7 7 7 7 7
  // 16  24  20  28  18  26  22  30      16  24  20  28  18  26  22  30         0 0 0 0 0 0 0 0     1 1 1 1 1 1 1 1
@@ -663,7 +664,7 @@ void inline FFT_256_halfzero(int *y)
      __shfl_left4(t, expanded[31]); q2 = sel?t:expanded[30]; __shfl_xor1(p2, q2);
      p = even? p1 : p2; q = even? q1 : q2;
      __shfl_cperm6(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
-     g_temp4[48+(get_local_id(0)&7)] = vec0;
+     g_temp4[(48+(get_local_id(0)&7)) * 4 + idx] = vec0;
  
  // 17  25  21  29  19  27  23  31      17  25  21  29  19  27  23  31         4 4 4 4 4 4 4 4     5 5 5 5 5 5 5 5
  // 17  25  21  29  19  27  23  31      17  25  21  29  19  27  23  31         2 2 2 2 2 2 2 2     3 3 3 3 3 3 3 3
@@ -688,7 +689,7 @@ void inline FFT_256_halfzero(int *y)
      __shfl_left4(t, expanded[30]); q2 = sel?expanded[31]:t; __shfl_xor1(p2, q2);
      p = even? p1 : p2; q = even? q1 : q2;
      __shfl_cperm7(vec0.w, (int)__byte_perm(mul_233(p),  mul_233(q) , 0x05040100));
-     g_temp4[56+(get_local_id(0)&7)] = vec0;
+     g_temp4[(56+(get_local_id(0)&7)) * 4 + idx] = vec0;
  
  #undef mul_185
  #undef mul_233
@@ -866,15 +867,15 @@ static inline void Round8_3_final(uint*const  A, const uint r, const  uint s, co
 	STEP8_MAJ_G(d_cw[3][7], 31, u, r, &A[8], &A[16], &A[24], A);
 }
 
-#define expanded_vector(x) g_fft4[x]
+#define expanded_vector(x) g_fft4[(x) * 4 + (get_local_id(0) >> 3)]
 //static __device__ __forceinline__ void expanded_vector(uint* w, const uint4* ptr){
 ////	asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(w[0]), "=r"(w[1]), "=r"(w[2]), "=r"(w[3]) : __LDG_PTR(ptr));
 //}
 
-static inline void Round8(uint*const  A, const uint thr_offset, __global uint4 *const g_fft4) {
+static inline void Round8(uint*const  A, const uint thr_offset, __local uint4 *const g_fft4) {
 
 	uint w[8];
-	uint tmp = thr_offset;
+	uint tmp = 0;
 	uint4 hv1, hv2;
 
 	uint r = 3, s = 23, t = 17, u = 27;
@@ -986,7 +987,7 @@ static inline void Round8(uint*const  A, const uint thr_offset, __global uint4 *
 
 }
 
-void x11_simd512_gpu_compress_64(__global uint *Hash, __global uint4 * g_fft4)
+void x11_simd512_gpu_compress_64(__global uint *Hash, __local uint4 * g_fft4)
 {
 uint gid = get_global_id(0);
   uint threadBloc = (gid-get_global_offset(0)) >> 3;
diff --git a/ocl.c b/ocl.c
index ceeec97..52de5ac 100644
--- a/ocl.c
+++ b/ocl.c
@@ -1032,10 +1032,6 @@ if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_NAVI) {
     bufsize = (size_t)algorithm->rw_buffer_size;
     applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
   }
-  if (algorithm->type == ALGO_0X10_NAVI) {
-    buf1size = bufsize * 16;
-    applog(LOG_DEBUG, "0x10 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)bufsize);
-  }
 
   clState->padbuffer8 = NULL;
   clState->buffer1 = NULL;
@@ -1115,14 +1111,6 @@ if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_NAVI) {
         return NULL;
       }
     }
-    else if (algorithm->type == ALGO_0X10_NAVI) {
-      // need additionnal buffers
-      clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
-      if (status != CL_SUCCESS && !clState->buffer1) {
-        applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
-        return NULL;
-      }
-    }
     else if (algorithm->type == ALGO_PHI2 || algorithm->type == ALGO_PHI2_NAVI) {
       clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
       if (status != CL_SUCCESS && !clState->buffer1) {