Add multiply-add modulo the curve25519/edwards25519 basepoint order

The functions bignum_madd_n25519 and bignum_madd_n25519_alt perform multiply-add modulo the order of the curve25519/edwards25519 basepoint, n_25519 = 2^252 + 27742317777372353535851937790883648493, i.e. z := (x * y + c) mod n_25519 where all variables are 256 bits.
awslabs · Oct 20, 2023 · 7fc5883 · 7fc5883
1 parent e23fd30
commit 7fc5883
Show file tree

Hide file tree

Showing 21 changed files with 5,563 additions and 0 deletions.
diff --git a/arm/Makefile b/arm/Makefile
@@ -113,6 +113,8 @@ BIGNUM_OBJ = curve25519/bignum_add_p25519.o \
              curve25519/bignum_inv_p25519.o \
              curve25519/bignum_invsqrt_p25519.o \
              curve25519/bignum_invsqrt_p25519_alt.o \
+             curve25519/bignum_madd_n25519.o \
+             curve25519/bignum_madd_n25519_alt.o \
              curve25519/bignum_mod_m25519_4.o \
              curve25519/bignum_mod_n25519.o \
              curve25519/bignum_mod_n25519_4.o \

diff --git a/arm/curve25519/Makefile b/arm/curve25519/Makefile
@@ -27,6 +27,8 @@ OBJ = bignum_add_p25519.o \
       bignum_inv_p25519.o \
       bignum_invsqrt_p25519.o \
       bignum_invsqrt_p25519_alt.o \
+      bignum_madd_n25519.o \
+      bignum_madd_n25519_alt.o \
       bignum_mod_m25519_4.o \
       bignum_mod_n25519.o \
       bignum_mod_n25519_4.o \

diff --git a/arm/curve25519/bignum_madd_n25519.S b/arm/curve25519/bignum_madd_n25519.S
@@ -0,0 +1,284 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Multiply-add modulo the order of the curve25519/edwards25519 basepoint
+// Inputs x[4], y[4], c[4]; output z[4]
+//
+//    extern void bignum_madd_n25519
+//     (uint64_t z[static 4], uint64_t x[static 4],
+//      uint64_t y[static 4], uint64_t c[static 4]);
+//
+// Performs z := (x * y + c) mod n_25519, where the modulus is
+// n_25519 = 2^252 + 27742317777372353535851937790883648493, the
+// order of the curve25519/edwards25519 basepoint. The result z
+// and the inputs x, y and c are all 4 digits (256 bits).
+//
+// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = c
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd_n25519)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd_n25519)
+        .text
+        .balign 4
+
+// Backup of the input pointer so we can modify x0
+
+#define z x19
+
+// Temporaries for reduction phase
+
+#define q   x2
+#define n0  x3
+#define n1  x4
+#define t0  x5
+#define t1  x6
+#define t2  x7
+
+// Loading large constants
+
+#define movbig(nn,n3,n2,n1,n0)                                      \
+        movz    nn, n0;                                             \
+        movk    nn, n1, lsl #16;                                    \
+        movk    nn, n2, lsl #32;                                    \
+        movk    nn, n3, lsl #48
+
+// Single round of modular reduction mod_n25519, mapping
+// [m4;m3;m2;m1;m0] = m to [m3;m2;m1;m0] = m mod n_25519,
+// *assuming* the input m < 2^64 * n_25519. This is very
+// close to the loop body of the bignum_mod_n25519 function.
+
+#define reduce(m4,m3,m2,m1,m0)                          \
+        extr    q, m4, m3, #60;                         \
+        and     m3, m3, #0x0FFFFFFFFFFFFFFF;            \
+        sub     q, q, m4, lsr #60;                      \
+        and     t0, m4, #0xF000000000000000;            \
+        add     m3, m3, t0;                             \
+        mul     t0, n0, q;                              \
+        mul     t1, n1, q;                              \
+        umulh   t2, n0, q;                              \
+        adds    t1, t1, t2;                             \
+        umulh   t2, n1, q;                              \
+        adc     t2, t2, xzr;                            \
+        subs    m0, m0, t0;                             \
+        sbcs    m1, m1, t1;                             \
+        sbcs    m2, m2, t2;                             \
+        sbcs    m3, m3, xzr;                            \
+        csel    t0, n0, xzr, cc;                        \
+        csel    t1, n1, xzr, cc;                        \
+        adds    m0, m0, t0;                             \
+        and     t2, t0, #0x1000000000000000;            \
+        adcs    m1, m1, t1;                             \
+        adcs    m2, m2, xzr;                            \
+        adc     m3, m3, t2
+
+// Special case of "reduce" with m4 = 0. As well as not using m4,
+// the quotient selection is slightly simpler, just floor(m/2^252)
+// versus min (floor(m/2^252)) (2^63-1).
+
+#define reduce0(m3,m2,m1,m0)                            \
+        lsr     q, m3, #60;                             \
+        and     m3, m3, #0x0FFFFFFFFFFFFFFF;            \
+        mul     t0, n0, q;                              \
+        mul     t1, n1, q;                              \
+        umulh   t2, n0, q;                              \
+        adds    t1, t1, t2;                             \
+        umulh   t2, n1, q;                              \
+        adc     t2, t2, xzr;                            \
+        subs    m0, m0, t0;                             \
+        sbcs    m1, m1, t1;                             \
+        sbcs    m2, m2, t2;                             \
+        sbcs    m3, m3, xzr;                            \
+        csel    t0, n0, xzr, cc;                        \
+        csel    t1, n1, xzr, cc;                        \
+        adds    m0, m0, t0;                             \
+        and     t2, t0, #0x1000000000000000;            \
+        adcs    m1, m1, t1;                             \
+        adcs    m2, m2, xzr;                            \
+        adc     m3, m3, t2
+
+S2N_BN_SYMBOL(bignum_madd_n25519):
+
+        stp     x19, x20, [sp, -16]!
+
+// Back up the result pointer so we can overwrite x0 in intermediate steps
+
+        mov     z, x0
+
+// First compute [x15;x14;x13;x12;x11;x10;x9;x8] = x * y. This is
+// a basic 2-level Karatsuba multiplier, similar to the start of
+// bignum_mul_p25519, but with changes to the register allocation,
+// which in particular preserve x3/w3 for the next step.
+
+        ldp     x0, x4, [x1]
+        ldp     x5, x6, [x2]
+        umull   x8, w0, w5
+        lsr     x17, x0, #32
+        umull   x7, w17, w5
+        lsr     x16, x5, #32
+        umull   x9, w16, w17
+        umull   x16, w0, w16
+        adds    x8, x8, x7, lsl #32
+        lsr     x7, x7, #32
+        adc     x9, x9, x7
+        adds    x8, x8, x16, lsl #32
+        lsr     x16, x16, #32
+        adc     x9, x9, x16
+        mul     x10, x4, x6
+        umulh   x11, x4, x6
+        subs    x4, x4, x0
+        cneg    x4, x4, cc
+        csetm   x16, cc
+        adds    x10, x10, x9
+        adc     x11, x11, xzr
+        subs    x0, x5, x6
+        cneg    x0, x0, cc
+        cinv    x16, x16, cc
+        mul     x7, x4, x0
+        umulh   x0, x4, x0
+        adds    x9, x8, x10
+        adcs    x10, x10, x11
+        adc     x11, x11, xzr
+        cmn     x16, #0x1
+        eor     x7, x7, x16
+        adcs    x9, x7, x9
+        eor     x0, x0, x16
+        adcs    x10, x0, x10
+        adc     x11, x11, x16
+        ldp     x0, x4, [x1, #16]
+        ldp     x5, x6, [x2, #16]
+        umull   x12, w0, w5
+        lsr     x17, x0, #32
+        umull   x7, w17, w5
+        lsr     x16, x5, #32
+        umull   x13, w16, w17
+        umull   x16, w0, w16
+        adds    x12, x12, x7, lsl #32
+        lsr     x7, x7, #32
+        adc     x13, x13, x7
+        adds    x12, x12, x16, lsl #32
+        lsr     x16, x16, #32
+        adc     x13, x13, x16
+        mul     x14, x4, x6
+        umulh   x15, x4, x6
+        subs    x4, x4, x0
+        cneg    x4, x4, cc
+        csetm   x16, cc
+        adds    x14, x14, x13
+        adc     x15, x15, xzr
+        subs    x0, x5, x6
+        cneg    x0, x0, cc
+        cinv    x16, x16, cc
+        mul     x7, x4, x0
+        umulh   x0, x4, x0
+        adds    x13, x12, x14
+        adcs    x14, x14, x15
+        adc     x15, x15, xzr
+        cmn     x16, #0x1
+        eor     x7, x7, x16
+        adcs    x13, x7, x13
+        eor     x0, x0, x16
+        adcs    x14, x0, x14
+        adc     x15, x15, x16
+        ldp     x0, x4, [x1, #16]
+        ldp     x7, x16, [x1]
+        subs    x0, x0, x7
+        sbcs    x4, x4, x16
+        csetm   x16, cc
+        ldp     x7, x17, [x2]
+        subs    x5, x7, x5
+        sbcs    x6, x17, x6
+        csetm   x17, cc
+        eor     x0, x0, x16
+        subs    x0, x0, x16
+        eor     x4, x4, x16
+        sbc     x4, x4, x16
+        eor     x5, x5, x17
+        subs    x5, x5, x17
+        eor     x6, x6, x17
+        sbc     x6, x6, x17
+        eor     x16, x17, x16
+        adds    x12, x12, x10
+        adcs    x13, x13, x11
+        adcs    x14, x14, xzr
+        adc     x15, x15, xzr
+        mul     x2, x0, x5
+        umulh   x17, x0, x5
+        mul     x7, x4, x6
+        umulh   x1, x4, x6
+        subs    x4, x4, x0
+        cneg    x4, x4, cc
+        csetm   x10, cc
+        adds    x7, x7, x17
+        adc     x1, x1, xzr
+        subs    x6, x5, x6
+        cneg    x6, x6, cc
+        cinv    x10, x10, cc
+        mul     x5, x4, x6
+        umulh   x6, x4, x6
+        adds    x17, x2, x7
+        adcs    x7, x7, x1
+        adc     x1, x1, xzr
+        cmn     x10, #0x1
+        eor     x5, x5, x10
+        adcs    x17, x5, x17
+        eor     x6, x6, x10
+        adcs    x7, x6, x7
+        adc     x1, x1, x10
+        adds    x10, x12, x8
+        adcs    x11, x13, x9
+        adcs    x12, x14, x12
+        adcs    x13, x15, x13
+        adcs    x14, x14, xzr
+        adc     x15, x15, xzr
+        cmn     x16, #0x1
+        eor     x2, x2, x16
+        adcs    x10, x2, x10
+        eor     x17, x17, x16
+        adcs    x11, x17, x11
+        eor     x7, x7, x16
+        adcs    x12, x7, x12
+        eor     x1, x1, x16
+        adcs    x13, x1, x13
+        adcs    x14, x14, x16
+        adc     x15, x15, x16
+
+// Add the constant term, so [x15;x14;x13;x12;x11;x10;x9;x8] = x * y + c
+// It's easier to just do this now versus incorporating it into the
+// Karatsuba steps above or deferring it until partway through the
+// reduction, though it does result in a long carry propagation here.
+
+        ldp     x0, x1, [x3]
+        adds    x8, x8, x0
+        adcs    x9, x9, x1
+        ldp     x0, x1, [x3, #16]
+        adcs    x10, x10, x0
+        adcs    x11, x11, x1
+        adcs    x12, x12, xzr
+        adcs    x13, x13, xzr
+        adcs    x14, x14, xzr
+        adc     x15, x15, xzr
+
+// Now do the modular reduction and write back
+
+        movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed)
+        movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6)
+
+        reduce0(x15,x14,x13,x12)
+        reduce(x15,x14,x13,x12,x11)
+        reduce(x14,x13,x12,x11,x10)
+        reduce(x13,x12,x11,x10,x9)
+        reduce(x12,x11,x10,x9,x8)
+
+        stp     x8, x9, [z]
+        stp     x10, x11, [z, #16]
+
+// Restore registers and return
+
+        ldp     x19, x20, [sp], 16
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stacz,"",%progbits
+#endif