Skip to content

Commit

Permalink
Merge pull request #144 from jargh/main
Browse files Browse the repository at this point in the history
P-384 field inverses and delocator-pacifying tweaks
  • Loading branch information
jargh authored Sep 16, 2024
2 parents 08bf556 + 9019f26 commit d85c6b5
Show file tree
Hide file tree
Showing 39 changed files with 27,057 additions and 861 deletions.
4 changes: 3 additions & 1 deletion arm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -283,11 +283,13 @@ BIGNUM_OBJ = curve25519/bignum_add_p25519.o \
p384/bignum_demont_p384.o \
p384/bignum_double_p384.o \
p384/bignum_half_p384.o \
p384/bignum_inv_p384.o \
p384/bignum_littleendian_6.o \
p384/bignum_mod_n384.o \
p384/bignum_mod_n384_6.o \
p384/bignum_mod_p384.o \
p384/bignum_mod_p384_6.o \
p384/bignum_montinv_p384.o \
p384/bignum_montmul_p384.o \
p384/bignum_montmul_p384_alt.o \
p384/bignum_montmul_p384_neon.o \
Expand Down Expand Up @@ -378,7 +380,7 @@ OBJ = $(POINT_OBJ) $(BIGNUM_OBJ)

libs2nbignum.a: $(OBJ) ; ar -rc libs2nbignum.a $(OBJ)

clean:; rm -f libs2nbignum.a */*.o */*.correct
clean:; rm -f libs2nbignum.a */*.o */*/*.o */*.correct

# Proof-related parts
#
Expand Down
2 changes: 1 addition & 1 deletion arm/p256/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ OBJ = bignum_add_p256.o \

default: $(OBJ);

clean:; rm -f *.o *.correct
clean:; rm -f *.o *.correct unopt/*.o
8 changes: 4 additions & 4 deletions arm/p256/bignum_montinv_p256.S
Original file line number Diff line number Diff line change
Expand Up @@ -820,9 +820,9 @@ S2N_BN_SYMBOL(bignum_montinv_p256):

mov i, #10
mov d, #1
b midloop
b bignum_montinv_p256_midloop

loop:
bignum_montinv_p256_loop:

// Separate the matrix elements into sign-magnitude pairs

Expand Down Expand Up @@ -1137,7 +1137,7 @@ loop:
stp x1, x3, [v]
stp x2, x5, [v+16]

midloop:
bignum_montinv_p256_midloop:

mov x1, d
ldr x2, [f]
Expand All @@ -1148,7 +1148,7 @@ midloop:
// Next iteration

subs i, i, #1
bne loop
bne bignum_montinv_p256_loop

// The 10th and last iteration does not need anything except the
// u value and the sign of f; the latter can be obtained from the
Expand Down
123 changes: 59 additions & 64 deletions arm/p256/p256_montjscalarmul.S
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,31 @@

#define NSPACE #(31*NUMSIZE)

// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator,
// which doesn't accept repetitions, assembler macros etc.

// selectblock(I): one step of a scan over consecutive 96-byte entries.
//
// Compares x14 against the constant I, then loads the twelve 64-bit
// words at [x15] .. [x15+88] two at a time through x12/x13. Each of
// x0..x11 is replaced by the corresponding loaded word only when
// x14 == I; otherwise x0..x11 keep their previous values. Finally x15
// is advanced by 96 so the next invocation reads the following entry.
//
// All six loads are issued whether or not the comparison matches; the
// choice is made purely with csel on the flags set by the initial cmp
// (none of the intervening ldp/csel instructions modify the flags).
//
// Inputs:   x14 = index to match, x15 = address of the current entry,
//           x0..x11 = value accumulated so far
// Outputs:  x0..x11 (conditionally updated), x15 = x15 + 96
// Clobbers: x12, x13, condition flags
//
// This is a C preprocessor macro, so the expansion is a single line
// with the instructions separated by ';'.

#define selectblock(I) \
cmp x14, #(1*I); \
ldp x12, x13, [x15]; \
csel x0, x12, x0, eq; \
csel x1, x13, x1, eq; \
ldp x12, x13, [x15, #16]; \
csel x2, x12, x2, eq; \
csel x3, x13, x3, eq; \
ldp x12, x13, [x15, #32]; \
csel x4, x12, x4, eq; \
csel x5, x13, x5, eq; \
ldp x12, x13, [x15, #48]; \
csel x6, x12, x6, eq; \
csel x7, x13, x7, eq; \
ldp x12, x13, [x15, #64]; \
csel x8, x12, x8, eq; \
csel x9, x13, x9, eq; \
ldp x12, x13, [x15, #80]; \
csel x10, x12, x10, eq; \
csel x11, x13, x11, eq; \
add x15, x15, #96

// Loading large constants

#define movbig(nn,n3,n2,n1,n0) \
Expand Down Expand Up @@ -173,34 +198,34 @@ S2N_BN_SYMBOL(p256_montjscalarmul):

add x0, tab+96*1
add x1, tab
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, tab+96*2
add x1, tab+96*1
add x2, tab
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

add x0, tab+96*3
add x1, tab+96*1
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, tab+96*4
add x1, tab+96*3
add x2, tab
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

add x0, tab+96*5
add x1, tab+96*2
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, tab+96*6
add x1, tab+96*5
add x2, tab
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

add x0, tab+96*7
add x1, tab+96*3
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

// Initialize the accumulator as a table entry for top 4 bits (unrecoded)

Expand All @@ -221,30 +246,15 @@ S2N_BN_SYMBOL(p256_montjscalarmul):
mov x11, xzr
add x15, tab

.set i, 1
.rep 8
cmp x14, #i
ldp x12, x13, [x15]
csel x0, x12, x0, eq
csel x1, x13, x1, eq
ldp x12, x13, [x15, #16]
csel x2, x12, x2, eq
csel x3, x13, x3, eq
ldp x12, x13, [x15, #32]
csel x4, x12, x4, eq
csel x5, x13, x5, eq
ldp x12, x13, [x15, #48]
csel x6, x12, x6, eq
csel x7, x13, x7, eq
ldp x12, x13, [x15, #64]
csel x8, x12, x8, eq
csel x9, x13, x9, eq
ldp x12, x13, [x15, #80]
csel x10, x12, x10, eq
csel x11, x13, x11, eq
add x15, x15, #96
.set i, (i+1)
.endr
selectblock(1)
selectblock(2)
selectblock(3)
selectblock(4)
selectblock(5)
selectblock(6)
selectblock(7)
selectblock(8)

stp x0, x1, [acc]
stp x2, x3, [acc+16]
stp x4, x5, [acc+32]
Expand All @@ -256,24 +266,24 @@ S2N_BN_SYMBOL(p256_montjscalarmul):

// Main loop over size-4 bitfields: double 4 times then add signed digit

loop:
p256_montjscalarmul_mainloop:
sub j, j, #4

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

add x0, acc
add x1, acc
bl local_p256_montjdouble
bl p256_montjscalarmul_p256_montjdouble

lsr x2, j, #6
ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly
Expand All @@ -299,30 +309,15 @@ loop:
mov x10, xzr
mov x11, xzr
add x15, tab
.set i, 1
.rep 8
cmp x14, #i
ldp x12, x13, [x15]
csel x0, x12, x0, eq
csel x1, x13, x1, eq
ldp x12, x13, [x15, #16]
csel x2, x12, x2, eq
csel x3, x13, x3, eq
ldp x12, x13, [x15, #32]
csel x4, x12, x4, eq
csel x5, x13, x5, eq
ldp x12, x13, [x15, #48]
csel x6, x12, x6, eq
csel x7, x13, x7, eq
ldp x12, x13, [x15, #64]
csel x8, x12, x8, eq
csel x9, x13, x9, eq
ldp x12, x13, [x15, #80]
csel x10, x12, x10, eq
csel x11, x13, x11, eq
add x15, x15, #96
.set i, (i+1)
.endr

selectblock(1)
selectblock(2)
selectblock(3)
selectblock(4)
selectblock(5)
selectblock(6)
selectblock(7)
selectblock(8)

// Store it to "tabent" with the y coordinate optionally negated
// Again, do it carefully to give coordinates < p_256 even in
Expand Down Expand Up @@ -357,9 +352,9 @@ loop:
add x0, acc
add x1, acc
add x2, tabent
bl local_p256_montjadd
bl p256_montjscalarmul_p256_montjadd

cbnz j, loop
cbnz j, p256_montjscalarmul_mainloop

// That's the end of the main loop, and we just need to copy the
// result in "acc" to the output.
Expand All @@ -386,7 +381,7 @@ loop:

// Local copies of subroutines, complete clones at the moment

local_p256_montjadd:
p256_montjscalarmul_p256_montjadd:
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp x23, x24, [sp, #-16]!
Expand Down Expand Up @@ -3506,7 +3501,7 @@ local_p256_montjadd:
ldp x19, x20, [sp], #16
ret

local_p256_montjdouble:
p256_montjscalarmul_p256_montjdouble:
sub sp, sp, #0x110
stp x19, x20, [sp, #192]
stp x21, x22, [sp, #208]
Expand Down
Loading

0 comments on commit d85c6b5

Please sign in to comment.