Skip to content

Commit

Permalink
Merge pull request #81 from aqjune-aws/tablelookup
Browse files Browse the repository at this point in the history
Add bignum_copy_row_from_table and its Neon-variants for AArch64
  • Loading branch information
jargh authored Sep 16, 2023
2 parents 20ad76e + f1ad23c commit 50aa85b
Show file tree
Hide file tree
Showing 16 changed files with 2,964 additions and 8 deletions.
4 changes: 4 additions & 0 deletions arm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ BIGNUM_OBJ = curve25519/bignum_add_p25519.o \
generic/bignum_cmul.o \
generic/bignum_coprime.o \
generic/bignum_copy.o \
generic/bignum_copy_row_from_table.o \
generic/bignum_copy_row_from_table_8n_neon.o \
generic/bignum_copy_row_from_table_16_neon.o \
generic/bignum_copy_row_from_table_32_neon.o \
generic/bignum_ctd.o \
generic/bignum_ctz.o \
generic/bignum_demont.o \
Expand Down
4 changes: 4 additions & 0 deletions arm/generic/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ OBJ = bignum_add.o \
bignum_cmul.o \
bignum_coprime.o \
bignum_copy.o \
bignum_copy_row_from_table.o \
bignum_copy_row_from_table_8n_neon.o \
bignum_copy_row_from_table_16_neon.o \
bignum_copy_row_from_table_32_neon.o \
bignum_ctd.o \
bignum_ctz.o \
bignum_demont.o \
Expand Down
81 changes: 81 additions & 0 deletions arm/generic/bignum_copy_row_from_table.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
// into z[0..width-1].
// This function is constant-time with respect to the value of `idx`. This is
// achieved by reading the whole table and using the bit-masking to get the
// `idx`-th row.
//
// extern void bignum_copy_row_from_table
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width,
// uint64_t idx);
//
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table)
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table)
        .text
        .balign 4

// Incoming arguments
#define z x0
#define table x1
#define height x2
#define width x3
#define idx x4

// Phase 1 (clearing z): words still to clear, and the write cursor
#define zleft x5
#define zcur x6

// Phase 2 (scanning the table): x5 and x6 are reused for the row number
// and the row-select mask; x7-x11 hold the remaining loop state.
#define row x5
#define mask x6
#define colsleft x7
#define tptr x8
#define zptr x9
#define tword x10
#define zword x11

// ----------------------------------------------------------------------------
// Selects row `idx` of a height x width table of 64-bit words into z.
//
// Every word of the table is loaded regardless of idx; each row is ANDed with
// a mask that is all-ones only when the row number equals idx, and the result
// is ORed into z. There is no branch or address that depends on idx.
//
// If height or width is zero the routine returns without writing to z.
// If idx matches no row (idx >= height), z[0..width-1] is left all zero.
//
// Only x5-x11 and the flags are modified; the argument registers are kept.
// ----------------------------------------------------------------------------
S2N_BN_SYMBOL(bignum_copy_row_from_table):

        // Nothing to do for an empty table or zero-width rows
        cbz     height, bignum_copy_row_from_table_end
        cbz     width, bignum_copy_row_from_table_end

        // z[0..width-1] := 0, so rows can simply be ORed in afterwards
        mov     zcur, z
        mov     zleft, width

bignum_copy_row_from_table_initzero:
        str     xzr, [zcur], #8
        subs    zleft, zleft, #1
        b.ne    bignum_copy_row_from_table_initzero

        // tptr walks the whole table linearly, one word at a time
        mov     tptr, table
        mov     row, #0

bignum_copy_row_from_table_outerloop:

        // mask = (row == idx) ? 0xFFFFFFFFFFFFFFFF : 0
        cmp     row, idx
        csetm   mask, eq

        mov     zptr, z
        mov     colsleft, width

bignum_copy_row_from_table_innerloop:

        // z[col] |= table[row][col] & mask
        ldr     tword, [tptr], #8
        ldr     zword, [zptr]
        and     tword, tword, mask
        orr     zword, zword, tword
        str     zword, [zptr], #8

        subs    colsleft, colsleft, #1
        b.ne    bignum_copy_row_from_table_innerloop

bignum_copy_row_from_table_innerloop_done:
        // Advance to the next row until all `height` rows have been visited
        add     row, row, #1
        cmp     row, height
        b.ne    bignum_copy_row_from_table_outerloop

bignum_copy_row_from_table_end:
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
126 changes: 126 additions & 0 deletions arm/generic/bignum_copy_row_from_table_16_neon.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1]
// into z[0..15].
// This function is constant-time with respect to the value of `idx`. This is
// achieved by reading the whole table and using the bit-masking to get the
// `idx`-th row.
//
// extern void bignum_copy_row_from_table_16_neon
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx);
//
// Initial version written by Hanno Becker
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_16_neon)
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_16_neon)
        .text
        .balign 4


// *****************************************************
// Main code
// *****************************************************

// Incoming arguments (note: idx is the fourth argument, x3)
#define z x0
#define tbl x1
#define height x2
#define idx x3

// mask: all-ones when the current row is the wanted one, else zero
// cnt:  number of the row currently being scanned
#define mask x5
#define cnt x6

// Eight 128-bit accumulators holding the 16 selected 64-bit words.
// v20-v27 are in the caller-saved range v16-v31, so nothing is spilled.
#define ventry0 v20
#define qentry0 q20
#define ventry1 v21
#define qentry1 q21
#define ventry2 v22
#define qentry2 q22
#define ventry3 v23
#define qentry3 q23
#define ventry4 v24
#define qentry4 q24
#define ventry5 v25
#define qentry5 q25
#define ventry6 v26
#define qentry6 q26
#define ventry7 v27
#define qentry7 q27

// Scratch for the 128-bit chunk just loaded from the table
#define vtmp v16
#define qtmp q16

// mask broadcast to both 64-bit lanes
#define vmask v17

// ----------------------------------------------------------------------------
// Selects row `idx` of a table with `height` rows of 16 64-bit words into
// z[0..15].
//
// Every row is loaded regardless of idx; BIT merges a row into the
// accumulators only where the per-row mask is all-ones, so there is no branch
// or address that depends on idx. If idx matches no row (idx >= height) the
// accumulators stay zero and z[0..15] is written as zero.
//
// If height is zero the routine returns without reading the table or writing
// z, as the generic bignum_copy_row_from_table does. Without this guard the
// loop below, which tests its exit condition only after processing a row,
// would never see cnt == height and would read far beyond the table.
//
// Modifies tbl (x1), x5, x6, v16, v17, v20-v27 and the flags.
// ----------------------------------------------------------------------------
S2N_BN_SYMBOL(bignum_copy_row_from_table_16_neon):

        // Empty table: nothing to scan, leave z untouched
        cbz     height, bignum_copy_row_from_table_16_neon_ret

        // Clear accumulator
        // Zeroing can be done via xor, but xor isn't formalized yet.
        dup     ventry0.2d, xzr
        mov     ventry1.16b, ventry0.16b
        mov     ventry2.16b, ventry0.16b
        mov     ventry3.16b, ventry0.16b
        mov     ventry4.16b, ventry0.16b
        mov     ventry5.16b, ventry0.16b
        mov     ventry6.16b, ventry0.16b
        mov     ventry7.16b, ventry0.16b

        mov     cnt, #0
bignum_copy_row_from_table_16_neon_loop:

        // Compute mask: Check if current index matches target index.
        // cinv yields ~xzr (all-ones) on equality and xzr (zero) otherwise.
        subs    xzr, cnt, idx
        cinv    mask, xzr, eq
        dup     vmask.2d, mask

        // For each 128-bit chunk of the row: where vmask bits are set, copy
        // the loaded bits into the accumulator; elsewhere keep the old value.
        ldr     qtmp, [tbl, #16*0]
        bit     ventry0.16b, vtmp.16b, vmask.16b

        ldr     qtmp, [tbl, #16*1]
        bit     ventry1.16b, vtmp.16b, vmask.16b

        ldr     qtmp, [tbl, #16*2]
        bit     ventry2.16b, vtmp.16b, vmask.16b

        ldr     qtmp, [tbl, #16*3]
        bit     ventry3.16b, vtmp.16b, vmask.16b

        ldr     qtmp, [tbl, #16*4]
        bit     ventry4.16b, vtmp.16b, vmask.16b

        ldr     qtmp, [tbl, #16*5]
        bit     ventry5.16b, vtmp.16b, vmask.16b

        ldr     qtmp, [tbl, #16*6]
        bit     ventry6.16b, vtmp.16b, vmask.16b

        ldr     qtmp, [tbl, #16*7]
        bit     ventry7.16b, vtmp.16b, vmask.16b

        // Step to the next row: 8 chunks of 16 bytes = 16 words
        add     tbl, tbl, #16*8

        add     cnt, cnt, #1
        subs    xzr, height, cnt
        b.ne    bignum_copy_row_from_table_16_neon_loop

bignum_copy_row_from_table_16_neon_end:

        // Write the selected row (or zeros if no row matched) to z[0..15]
        str     qentry0, [z, #16*0]
        str     qentry1, [z, #16*1]
        str     qentry2, [z, #16*2]
        str     qentry3, [z, #16*3]
        str     qentry4, [z, #16*4]
        str     qentry5, [z, #16*5]
        str     qentry6, [z, #16*6]
        str     qentry7, [z, #16*7]

bignum_copy_row_from_table_16_neon_ret:
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
Loading

0 comments on commit 50aa85b

Please sign in to comment.