Skip to content

Commit

Permalink
Make some new P-256 and P-384 functions delocator-proof
Browse files Browse the repository at this point in the history
For some new functions that we want to integrate into AWS-LC,
this satisfies the BoringSSL / AWS-LC delocator by (1) making
the labels unique and (2) avoiding .rep / .endr (the assembler
repetition directives), which are replaced by C preprocessor macro blocks.
  • Loading branch information
jargh committed Aug 23, 2024
1 parent 95b4d64 commit 9aa8155
Show file tree
Hide file tree
Showing 18 changed files with 1,020 additions and 864 deletions.
8 changes: 4 additions & 4 deletions arm/p256/bignum_montinv_p256.S
Original file line number Diff line number Diff line change
Expand Up @@ -820,9 +820,9 @@ S2N_BN_SYMBOL(bignum_montinv_p256):

mov i, #10
mov d, #1
b midloop
b bignum_montinv_p256_midloop

loop:
bignum_montinv_p256_loop:

// Separate the matrix elements into sign-magnitude pairs

Expand Down Expand Up @@ -1137,7 +1137,7 @@ loop:
stp x1, x3, [v]
stp x2, x5, [v+16]

midloop:
bignum_montinv_p256_midloop:

mov x1, d
ldr x2, [f]
Expand All @@ -1148,7 +1148,7 @@ midloop:
// Next iteration

subs i, i, #1
bne loop
bne bignum_montinv_p256_loop

// The 10th and last iteration does not need anything except the
// u value and the sign of f; the latter can be obtained from the
Expand Down
123 changes: 59 additions & 64 deletions arm/p256/p256_montjscalarmul.S
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,31 @@

#define NSPACE #(31*NUMSIZE)

// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator,
// which doesn't accept repetitions, assembler macros etc.

#define selectblock(I) \
cmp x14, #(I); \
ldp x12, x13, [x15]; \
csel x0, x12, x0, eq; \
csel x1, x13, x1, eq; \
ldp x12, x13, [x15, #16]; \
csel x2, x12, x2, eq; \
csel x3, x13, x3, eq; \
ldp x12, x13, [x15, #32]; \
csel x4, x12, x4, eq; \
csel x5, x13, x5, eq; \
ldp x12, x13, [x15, #48]; \
csel x6, x12, x6, eq; \
csel x7, x13, x7, eq; \
ldp x12, x13, [x15, #64]; \
csel x8, x12, x8, eq; \
csel x9, x13, x9, eq; \
ldp x12, x13, [x15, #80]; \
csel x10, x12, x10, eq; \
csel x11, x13, x11, eq; \
add x15, x15, #96

// Loading large constants

#define movbig(nn,n3,n2,n1,n0) \
Expand Down Expand Up @@ -173,34 +198,34 @@ S2N_BN_SYMBOL(p256_montjscalarmul):

add x0, tab+96*1
add x1, tab
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, tab+96*2
add x1, tab+96*1
add x2, tab
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

add x0, tab+96*3
add x1, tab+96*1
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, tab+96*4
add x1, tab+96*3
add x2, tab
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

add x0, tab+96*5
add x1, tab+96*2
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, tab+96*6
add x1, tab+96*5
add x2, tab
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

add x0, tab+96*7
add x1, tab+96*3
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

// Initialize the accumulator as a table entry for top 4 bits (unrecoded)

Expand All @@ -221,30 +246,15 @@ S2N_BN_SYMBOL(p256_montjscalarmul):
mov x11, xzr
add x15, tab

.set i, 1
.rep 8
cmp x14, #i
ldp x12, x13, [x15]
csel x0, x12, x0, eq
csel x1, x13, x1, eq
ldp x12, x13, [x15, #16]
csel x2, x12, x2, eq
csel x3, x13, x3, eq
ldp x12, x13, [x15, #32]
csel x4, x12, x4, eq
csel x5, x13, x5, eq
ldp x12, x13, [x15, #48]
csel x6, x12, x6, eq
csel x7, x13, x7, eq
ldp x12, x13, [x15, #64]
csel x8, x12, x8, eq
csel x9, x13, x9, eq
ldp x12, x13, [x15, #80]
csel x10, x12, x10, eq
csel x11, x13, x11, eq
add x15, x15, #96
.set i, (i+1)
.endr
selectblock(1)
selectblock(2)
selectblock(3)
selectblock(4)
selectblock(5)
selectblock(6)
selectblock(7)
selectblock(8)

stp x0, x1, [acc]
stp x2, x3, [acc+16]
stp x4, x5, [acc+32]
Expand All @@ -256,24 +266,24 @@ S2N_BN_SYMBOL(p256_montjscalarmul):

// Main loop over size-4 bitfields: double 4 times then add signed digit

loop:
p256_montjscalarmul_mainloop:
sub j, j, #4

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

lsr x2, j, #6
ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly
Expand All @@ -299,30 +309,15 @@ loop:
mov x10, xzr
mov x11, xzr
add x15, tab
.set i, 1
.rep 8
cmp x14, #i
ldp x12, x13, [x15]
csel x0, x12, x0, eq
csel x1, x13, x1, eq
ldp x12, x13, [x15, #16]
csel x2, x12, x2, eq
csel x3, x13, x3, eq
ldp x12, x13, [x15, #32]
csel x4, x12, x4, eq
csel x5, x13, x5, eq
ldp x12, x13, [x15, #48]
csel x6, x12, x6, eq
csel x7, x13, x7, eq
ldp x12, x13, [x15, #64]
csel x8, x12, x8, eq
csel x9, x13, x9, eq
ldp x12, x13, [x15, #80]
csel x10, x12, x10, eq
csel x11, x13, x11, eq
add x15, x15, #96
.set i, (i+1)
.endr

selectblock(1)
selectblock(2)
selectblock(3)
selectblock(4)
selectblock(5)
selectblock(6)
selectblock(7)
selectblock(8)

// Store it to "tabent" with the y coordinate optionally negated
// Again, do it carefully to give coordinates < p_256 even in
Expand Down Expand Up @@ -357,9 +352,9 @@ loop:
add x0, acc
add x1, acc
add x2, tabent
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

cbnz j, loop
cbnz j, p256_montjscalarmul_mainloop

// That's the end of the main loop, and we just need to copy the
// result in "acc" to the output.
Expand All @@ -386,7 +381,7 @@ loop:

// Local copies of subroutines, complete clones at the moment

local_p256_montjadd:
p256_montjscalarmul_p256_montjadd:
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp x23, x24, [sp, #-16]!
Expand Down Expand Up @@ -3506,7 +3501,7 @@ local_p256_montjadd:
ldp x19, x20, [sp], #16
ret

local_p256_montjdouble:
p256_montjscalarmul_p256_montjdouble:
sub sp, sp, #0x110
stp x19, x20, [sp, #192]
stp x21, x22, [sp, #208]
Expand Down
Loading

0 comments on commit 9aa8155

Please sign in to comment.