From 981e2a7ab7506fca90fe4cf1edce9a14b4e4b8c0 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Fri, 6 Aug 2021 14:35:01 +0000 Subject: [PATCH 1/3] internal/tony: Initial import This is a series of wrappers around fiat-crypto's curve25519 code, along with working around braindamage caused by the lack of `//go:inline`. Upstream commit: f1951b3d80f1d0e402a832682875667ae52e8028 --- go.mod | 1 + go.sum | 2 + internal/tony/tony.go | 38 ++ internal/tony/tony_optimizations_u64.go | 809 ++++++++++++++++++++++++ internal/tony/tony_u32.go | 141 +++++ internal/tony/tony_u64.go | 135 ++++ 6 files changed, 1126 insertions(+) create mode 100644 internal/tony/tony.go create mode 100644 internal/tony/tony_optimizations_u64.go create mode 100644 internal/tony/tony_u32.go create mode 100644 internal/tony/tony_u64.go diff --git a/go.mod b/go.mod index 8252b18..5a06000 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/oasisprotocol/curve25519-voi go 1.16 require ( + github.com/mit-plv/fiat-crypto/fiat-go v0.0.0-20210807234606-f1951b3d80f1 golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad golang.org/x/sys v0.0.0-20191026070338-33540a1f6037 ) diff --git a/go.sum b/go.sum index 0d16f09..c47ad84 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/mit-plv/fiat-crypto/fiat-go v0.0.0-20210807234606-f1951b3d80f1 h1:tclc3vLlQfZi9scV1zBCXboNWiG3bpbz7O1gwybPyNQ= +github.com/mit-plv/fiat-crypto/fiat-go v0.0.0-20210807234606-f1951b3d80f1/go.mod h1:59UI5/2yBTcSl1/+qCCOTsfXYy290H670oWjGFRqOLs= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad h1:DN0cp81fZ3njFcrLCytUHRSUkqBjfTo4Tx9RJTWs0EY= golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= diff --git a/internal/tony/tony.go b/internal/tony/tony.go new file mode 100644 index 0000000..e7cf190 --- /dev/null +++ b/internal/tony/tony.go @@ -0,0 +1,38 @@ +// Copyright (c) 2021 Oasis Labs Inc. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Package tony wraps fiat-crypto and provides some useful helpers and
+// fixes.
+package tony
+
+// RelaxCast converts from a TightFieldElement to a LooseFieldElement
+// by casting.
+func (tfe *TightFieldElement) RelaxCast() *LooseFieldElement {
+	return (*LooseFieldElement)(tfe)
+}
diff --git a/internal/tony/tony_optimizations_u64.go b/internal/tony/tony_optimizations_u64.go
new file mode 100644
index 0000000..7ce733f
--- /dev/null
+++ b/internal/tony/tony_optimizations_u64.go
@@ -0,0 +1,809 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+// Copyright (c) 2021 Oasis Labs Inc. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//go:build (amd64 || arm64 || ppc64le || ppc64 || s390x || force64bit) && !force32bit
+// +build amd64 arm64 ppc64le ppc64 s390x force64bit
+// +build !force32bit
+
+package tony
+
+import "math/bits"
+
+type (
+	uint1 uint64
+	int1  int64
+)
+
+// W A R N I N G
+// ------------------------------
+// Om Marishi Sowaka
+// The big enemy is approaching
+// at full throttle.
+// According to the data, it is
+// identified as "Optimizations".
+// ------------------------------
+// NO REFUGE
+//
+// There really isn't anything wrong with the fiat-crypto Go code.
+// There are, however, lots of things that are wrong with the Go compiler.
+//
+// * The inliner is awful, and there is no way to force inlining
+//   (golang/go#21536). In addition to the fused `Add`/`Sub`/`Opp`
+//   + `Carry` that upstream added for us, this does even more manual
+//   inlining.
+//
+//   * CarryMulAdd `a * (b + c)`
+//   * CarryMulSub `a * (b - c)`
+//   * CarryPow2k `a^(2^k)`, where `k > 0`. This is probably the one
+//     case where it is unreasonable to expect the compiler to auto
+//     inline the routine. This is also the most impactful, as it
+//     dramatically improves inversion/sqrt performance.
+// * cmovznzU64/addcarryxU51/subborrowxU51 function signature changed +// to speed up ToBytes. +// +// The dream is that eventually this file will go away entirely, but +// it's hard to get away from needing manual inlining. +// + +func carryMulAddInlined(out1 *TightFieldElement, arg1 *LooseFieldElement, arg2, arg3 *TightFieldElement) { + // Add (arg2 + arg3) + a0 := (arg2[0] + arg3[0]) + a1 := (arg2[1] + arg3[1]) + a2 := (arg2[2] + arg3[2]) + a3 := (arg2[3] + arg3[3]) + a4 := (arg2[4] + arg3[4]) + + // Mul (arg1 * (arg2 + arg3)) + var x1 uint64 + var x2 uint64 + x2, x1 = bits.Mul64(arg1[4], (a4 * 0x13)) + var x3 uint64 + var x4 uint64 + x4, x3 = bits.Mul64(arg1[4], (a3 * 0x13)) + var x5 uint64 + var x6 uint64 + x6, x5 = bits.Mul64(arg1[4], (a2 * 0x13)) + var x7 uint64 + var x8 uint64 + x8, x7 = bits.Mul64(arg1[4], (a1 * 0x13)) + var x9 uint64 + var x10 uint64 + x10, x9 = bits.Mul64(arg1[3], (a4 * 0x13)) + var x11 uint64 + var x12 uint64 + x12, x11 = bits.Mul64(arg1[3], (a3 * 0x13)) + var x13 uint64 + var x14 uint64 + x14, x13 = bits.Mul64(arg1[3], (a2 * 0x13)) + var x15 uint64 + var x16 uint64 + x16, x15 = bits.Mul64(arg1[2], (a4 * 0x13)) + var x17 uint64 + var x18 uint64 + x18, x17 = bits.Mul64(arg1[2], (a3 * 0x13)) + var x19 uint64 + var x20 uint64 + x20, x19 = bits.Mul64(arg1[1], (a4 * 0x13)) + var x21 uint64 + var x22 uint64 + x22, x21 = bits.Mul64(arg1[4], a0) + var x23 uint64 + var x24 uint64 + x24, x23 = bits.Mul64(arg1[3], a1) + var x25 uint64 + var x26 uint64 + x26, x25 = bits.Mul64(arg1[3], a0) + var x27 uint64 + var x28 uint64 + x28, x27 = bits.Mul64(arg1[2], a2) + var x29 uint64 + var x30 uint64 + x30, x29 = bits.Mul64(arg1[2], a1) + var x31 uint64 + var x32 uint64 + x32, x31 = bits.Mul64(arg1[2], a0) + var x33 uint64 + var x34 uint64 + x34, x33 = bits.Mul64(arg1[1], a3) + var x35 uint64 + var x36 uint64 + x36, x35 = bits.Mul64(arg1[1], a2) + var x37 uint64 + var x38 uint64 + x38, x37 = bits.Mul64(arg1[1], a1) + var x39 uint64 + var x40 uint64 + x40, x39 = bits.Mul64(arg1[1], a0) + var x41 uint64 + var x42 uint64 + x42, x41 = bits.Mul64(arg1[0], a4) + var x43 uint64 + var x44 uint64 + x44, x43 = bits.Mul64(arg1[0], a3) + var x45 uint64 + var x46 uint64 + x46, x45 = bits.Mul64(arg1[0], a2) + var x47 uint64 + var x48 uint64 + x48, x47 = bits.Mul64(arg1[0], a1) + var x49 uint64 + var x50 uint64 + x50, x49 = bits.Mul64(arg1[0], a0) + var x51 uint64 + var x52 uint64 + x51, x52 = bits.Add64(x13, x7, uint64(0x0)) + var x53 uint64 + x53, _ = bits.Add64(x14, x8, uint64(uint1(x52))) + var x55 uint64 + var x56 uint64 + x55, x56 = bits.Add64(x17, x51, uint64(0x0)) + var x57 uint64 + x57, _ = bits.Add64(x18, x53, uint64(uint1(x56))) + var x59 uint64 + var x60 uint64 + x59, x60 = bits.Add64(x19, x55, uint64(0x0)) + var x61 uint64 + x61, _ = bits.Add64(x20, x57, uint64(uint1(x60))) + var x63 uint64 + var x64 uint64 + x63, x64 = bits.Add64(x49, x59, uint64(0x0)) + var x65 uint64 + x65, _ = bits.Add64(x50, x61, uint64(uint1(x64))) + x67 := ((x63 >> 51) | ((x65 << 13) & 0xffffffffffffffff)) + x68 := (x63 & 0x7ffffffffffff) + var x69 uint64 + var x70 uint64 + x69, x70 = bits.Add64(x23, x21, uint64(0x0)) + var x71 uint64 + x71, _ = bits.Add64(x24, x22, uint64(uint1(x70))) + var x73 uint64 + var x74 uint64 + x73, x74 = bits.Add64(x27, x69, uint64(0x0)) + var x75 uint64 + x75, _ = bits.Add64(x28, x71, uint64(uint1(x74))) + var x77 uint64 + var x78 uint64 + x77, x78 = bits.Add64(x33, x73, uint64(0x0)) + var x79 uint64 + x79, _ = bits.Add64(x34, x75, uint64(uint1(x78))) + var x81 uint64 + var x82 
uint64 + x81, x82 = bits.Add64(x41, x77, uint64(0x0)) + var x83 uint64 + x83, _ = bits.Add64(x42, x79, uint64(uint1(x82))) + var x85 uint64 + var x86 uint64 + x85, x86 = bits.Add64(x25, x1, uint64(0x0)) + var x87 uint64 + x87, _ = bits.Add64(x26, x2, uint64(uint1(x86))) + var x89 uint64 + var x90 uint64 + x89, x90 = bits.Add64(x29, x85, uint64(0x0)) + var x91 uint64 + x91, _ = bits.Add64(x30, x87, uint64(uint1(x90))) + var x93 uint64 + var x94 uint64 + x93, x94 = bits.Add64(x35, x89, uint64(0x0)) + var x95 uint64 + x95, _ = bits.Add64(x36, x91, uint64(uint1(x94))) + var x97 uint64 + var x98 uint64 + x97, x98 = bits.Add64(x43, x93, uint64(0x0)) + var x99 uint64 + x99, _ = bits.Add64(x44, x95, uint64(uint1(x98))) + var x101 uint64 + var x102 uint64 + x101, x102 = bits.Add64(x9, x3, uint64(0x0)) + var x103 uint64 + x103, _ = bits.Add64(x10, x4, uint64(uint1(x102))) + var x105 uint64 + var x106 uint64 + x105, x106 = bits.Add64(x31, x101, uint64(0x0)) + var x107 uint64 + x107, _ = bits.Add64(x32, x103, uint64(uint1(x106))) + var x109 uint64 + var x110 uint64 + x109, x110 = bits.Add64(x37, x105, uint64(0x0)) + var x111 uint64 + x111, _ = bits.Add64(x38, x107, uint64(uint1(x110))) + var x113 uint64 + var x114 uint64 + x113, x114 = bits.Add64(x45, x109, uint64(0x0)) + var x115 uint64 + x115, _ = bits.Add64(x46, x111, uint64(uint1(x114))) + var x117 uint64 + var x118 uint64 + x117, x118 = bits.Add64(x11, x5, uint64(0x0)) + var x119 uint64 + x119, _ = bits.Add64(x12, x6, uint64(uint1(x118))) + var x121 uint64 + var x122 uint64 + x121, x122 = bits.Add64(x15, x117, uint64(0x0)) + var x123 uint64 + x123, _ = bits.Add64(x16, x119, uint64(uint1(x122))) + var x125 uint64 + var x126 uint64 + x125, x126 = bits.Add64(x39, x121, uint64(0x0)) + var x127 uint64 + x127, _ = bits.Add64(x40, x123, uint64(uint1(x126))) + var x129 uint64 + var x130 uint64 + x129, x130 = bits.Add64(x47, x125, uint64(0x0)) + var x131 uint64 + x131, _ = bits.Add64(x48, x127, uint64(uint1(x130))) + var x133 uint64 + var x134 uint64 + x133, x134 = bits.Add64(x67, x129, uint64(0x0)) + x135 := (uint64(uint1(x134)) + x131) + x136 := ((x133 >> 51) | ((x135 << 13) & 0xffffffffffffffff)) + x137 := (x133 & 0x7ffffffffffff) + var x138 uint64 + var x139 uint64 + x138, x139 = bits.Add64(x136, x113, uint64(0x0)) + x140 := (uint64(uint1(x139)) + x115) + x141 := ((x138 >> 51) | ((x140 << 13) & 0xffffffffffffffff)) + x142 := (x138 & 0x7ffffffffffff) + var x143 uint64 + var x144 uint64 + x143, x144 = bits.Add64(x141, x97, uint64(0x0)) + x145 := (uint64(uint1(x144)) + x99) + x146 := ((x143 >> 51) | ((x145 << 13) & 0xffffffffffffffff)) + x147 := (x143 & 0x7ffffffffffff) + var x148 uint64 + var x149 uint64 + x148, x149 = bits.Add64(x146, x81, uint64(0x0)) + x150 := (uint64(uint1(x149)) + x83) + x151 := ((x148 >> 51) | ((x150 << 13) & 0xffffffffffffffff)) + x152 := (x148 & 0x7ffffffffffff) + x153 := (x151 * 0x13) + x154 := (x68 + x153) + x155 := (x154 >> 51) + x156 := (x154 & 0x7ffffffffffff) + x157 := (x155 + x137) + x158 := uint1((x157 >> 51)) + x159 := (x157 & 0x7ffffffffffff) + x160 := (uint64(x158) + x142) + out1[0] = x156 + out1[1] = x159 + out1[2] = x160 + out1[3] = x147 + out1[4] = x152 +} + +func carryMulSubInlined(out1 *TightFieldElement, arg1 *LooseFieldElement, arg2, arg3 *TightFieldElement) { + // Sub (arg2 - arg3) + a0 := ((0xfffffffffffda + arg2[0]) - arg3[0]) + a1 := ((0xffffffffffffe + arg2[1]) - arg3[1]) + a2 := ((0xffffffffffffe + arg2[2]) - arg3[2]) + a3 := ((0xffffffffffffe + arg2[3]) - arg3[3]) + a4 := ((0xffffffffffffe + arg2[4]) - 
arg3[4]) + + // Mul (arg1 * (arg2 - arg3)) + var x1 uint64 + var x2 uint64 + x2, x1 = bits.Mul64(arg1[4], (a4 * 0x13)) + var x3 uint64 + var x4 uint64 + x4, x3 = bits.Mul64(arg1[4], (a3 * 0x13)) + var x5 uint64 + var x6 uint64 + x6, x5 = bits.Mul64(arg1[4], (a2 * 0x13)) + var x7 uint64 + var x8 uint64 + x8, x7 = bits.Mul64(arg1[4], (a1 * 0x13)) + var x9 uint64 + var x10 uint64 + x10, x9 = bits.Mul64(arg1[3], (a4 * 0x13)) + var x11 uint64 + var x12 uint64 + x12, x11 = bits.Mul64(arg1[3], (a3 * 0x13)) + var x13 uint64 + var x14 uint64 + x14, x13 = bits.Mul64(arg1[3], (a2 * 0x13)) + var x15 uint64 + var x16 uint64 + x16, x15 = bits.Mul64(arg1[2], (a4 * 0x13)) + var x17 uint64 + var x18 uint64 + x18, x17 = bits.Mul64(arg1[2], (a3 * 0x13)) + var x19 uint64 + var x20 uint64 + x20, x19 = bits.Mul64(arg1[1], (a4 * 0x13)) + var x21 uint64 + var x22 uint64 + x22, x21 = bits.Mul64(arg1[4], a0) + var x23 uint64 + var x24 uint64 + x24, x23 = bits.Mul64(arg1[3], a1) + var x25 uint64 + var x26 uint64 + x26, x25 = bits.Mul64(arg1[3], a0) + var x27 uint64 + var x28 uint64 + x28, x27 = bits.Mul64(arg1[2], a2) + var x29 uint64 + var x30 uint64 + x30, x29 = bits.Mul64(arg1[2], a1) + var x31 uint64 + var x32 uint64 + x32, x31 = bits.Mul64(arg1[2], a0) + var x33 uint64 + var x34 uint64 + x34, x33 = bits.Mul64(arg1[1], a3) + var x35 uint64 + var x36 uint64 + x36, x35 = bits.Mul64(arg1[1], a2) + var x37 uint64 + var x38 uint64 + x38, x37 = bits.Mul64(arg1[1], a1) + var x39 uint64 + var x40 uint64 + x40, x39 = bits.Mul64(arg1[1], a0) + var x41 uint64 + var x42 uint64 + x42, x41 = bits.Mul64(arg1[0], a4) + var x43 uint64 + var x44 uint64 + x44, x43 = bits.Mul64(arg1[0], a3) + var x45 uint64 + var x46 uint64 + x46, x45 = bits.Mul64(arg1[0], a2) + var x47 uint64 + var x48 uint64 + x48, x47 = bits.Mul64(arg1[0], a1) + var x49 uint64 + var x50 uint64 + x50, x49 = bits.Mul64(arg1[0], a0) + var x51 uint64 + var x52 uint64 + x51, x52 = bits.Add64(x13, x7, uint64(0x0)) + var x53 uint64 + x53, _ = bits.Add64(x14, x8, uint64(uint1(x52))) + var x55 uint64 + var x56 uint64 + x55, x56 = bits.Add64(x17, x51, uint64(0x0)) + var x57 uint64 + x57, _ = bits.Add64(x18, x53, uint64(uint1(x56))) + var x59 uint64 + var x60 uint64 + x59, x60 = bits.Add64(x19, x55, uint64(0x0)) + var x61 uint64 + x61, _ = bits.Add64(x20, x57, uint64(uint1(x60))) + var x63 uint64 + var x64 uint64 + x63, x64 = bits.Add64(x49, x59, uint64(0x0)) + var x65 uint64 + x65, _ = bits.Add64(x50, x61, uint64(uint1(x64))) + x67 := ((x63 >> 51) | ((x65 << 13) & 0xffffffffffffffff)) + x68 := (x63 & 0x7ffffffffffff) + var x69 uint64 + var x70 uint64 + x69, x70 = bits.Add64(x23, x21, uint64(0x0)) + var x71 uint64 + x71, _ = bits.Add64(x24, x22, uint64(uint1(x70))) + var x73 uint64 + var x74 uint64 + x73, x74 = bits.Add64(x27, x69, uint64(0x0)) + var x75 uint64 + x75, _ = bits.Add64(x28, x71, uint64(uint1(x74))) + var x77 uint64 + var x78 uint64 + x77, x78 = bits.Add64(x33, x73, uint64(0x0)) + var x79 uint64 + x79, _ = bits.Add64(x34, x75, uint64(uint1(x78))) + var x81 uint64 + var x82 uint64 + x81, x82 = bits.Add64(x41, x77, uint64(0x0)) + var x83 uint64 + x83, _ = bits.Add64(x42, x79, uint64(uint1(x82))) + var x85 uint64 + var x86 uint64 + x85, x86 = bits.Add64(x25, x1, uint64(0x0)) + var x87 uint64 + x87, _ = bits.Add64(x26, x2, uint64(uint1(x86))) + var x89 uint64 + var x90 uint64 + x89, x90 = bits.Add64(x29, x85, uint64(0x0)) + var x91 uint64 + x91, _ = bits.Add64(x30, x87, uint64(uint1(x90))) + var x93 uint64 + var x94 uint64 + x93, x94 = bits.Add64(x35, x89, 
uint64(0x0)) + var x95 uint64 + x95, _ = bits.Add64(x36, x91, uint64(uint1(x94))) + var x97 uint64 + var x98 uint64 + x97, x98 = bits.Add64(x43, x93, uint64(0x0)) + var x99 uint64 + x99, _ = bits.Add64(x44, x95, uint64(uint1(x98))) + var x101 uint64 + var x102 uint64 + x101, x102 = bits.Add64(x9, x3, uint64(0x0)) + var x103 uint64 + x103, _ = bits.Add64(x10, x4, uint64(uint1(x102))) + var x105 uint64 + var x106 uint64 + x105, x106 = bits.Add64(x31, x101, uint64(0x0)) + var x107 uint64 + x107, _ = bits.Add64(x32, x103, uint64(uint1(x106))) + var x109 uint64 + var x110 uint64 + x109, x110 = bits.Add64(x37, x105, uint64(0x0)) + var x111 uint64 + x111, _ = bits.Add64(x38, x107, uint64(uint1(x110))) + var x113 uint64 + var x114 uint64 + x113, x114 = bits.Add64(x45, x109, uint64(0x0)) + var x115 uint64 + x115, _ = bits.Add64(x46, x111, uint64(uint1(x114))) + var x117 uint64 + var x118 uint64 + x117, x118 = bits.Add64(x11, x5, uint64(0x0)) + var x119 uint64 + x119, _ = bits.Add64(x12, x6, uint64(uint1(x118))) + var x121 uint64 + var x122 uint64 + x121, x122 = bits.Add64(x15, x117, uint64(0x0)) + var x123 uint64 + x123, _ = bits.Add64(x16, x119, uint64(uint1(x122))) + var x125 uint64 + var x126 uint64 + x125, x126 = bits.Add64(x39, x121, uint64(0x0)) + var x127 uint64 + x127, _ = bits.Add64(x40, x123, uint64(uint1(x126))) + var x129 uint64 + var x130 uint64 + x129, x130 = bits.Add64(x47, x125, uint64(0x0)) + var x131 uint64 + x131, _ = bits.Add64(x48, x127, uint64(uint1(x130))) + var x133 uint64 + var x134 uint64 + x133, x134 = bits.Add64(x67, x129, uint64(0x0)) + x135 := (uint64(uint1(x134)) + x131) + x136 := ((x133 >> 51) | ((x135 << 13) & 0xffffffffffffffff)) + x137 := (x133 & 0x7ffffffffffff) + var x138 uint64 + var x139 uint64 + x138, x139 = bits.Add64(x136, x113, uint64(0x0)) + x140 := (uint64(uint1(x139)) + x115) + x141 := ((x138 >> 51) | ((x140 << 13) & 0xffffffffffffffff)) + x142 := (x138 & 0x7ffffffffffff) + var x143 uint64 + var x144 uint64 + x143, x144 = bits.Add64(x141, x97, uint64(0x0)) + x145 := (uint64(uint1(x144)) + x99) + x146 := ((x143 >> 51) | ((x145 << 13) & 0xffffffffffffffff)) + x147 := (x143 & 0x7ffffffffffff) + var x148 uint64 + var x149 uint64 + x148, x149 = bits.Add64(x146, x81, uint64(0x0)) + x150 := (uint64(uint1(x149)) + x83) + x151 := ((x148 >> 51) | ((x150 << 13) & 0xffffffffffffffff)) + x152 := (x148 & 0x7ffffffffffff) + x153 := (x151 * 0x13) + x154 := (x68 + x153) + x155 := (x154 >> 51) + x156 := (x154 & 0x7ffffffffffff) + x157 := (x155 + x137) + x158 := uint1((x157 >> 51)) + x159 := (x157 & 0x7ffffffffffff) + x160 := (uint64(x158) + x142) + out1[0] = x156 + out1[1] = x159 + out1[2] = x160 + out1[3] = x147 + out1[4] = x152 +} + +func carryPow2kInlined(out1 *TightFieldElement, arg1 *LooseFieldElement, arg2 uint) { + a0, a1, a2, a3, a4 := arg1[0], arg1[1], arg1[2], arg1[3], arg1[4] + + for { + x1 := (a4 * 0x13) + x2 := (x1 * 0x2) + x3 := (a4 * 0x2) + x4 := (a3 * 0x13) + x5 := (x4 * 0x2) + x6 := (a3 * 0x2) + x7 := (a2 * 0x2) + x8 := (a1 * 0x2) + var x9 uint64 + var x10 uint64 + x10, x9 = bits.Mul64(a4, x1) + var x11 uint64 + var x12 uint64 + x12, x11 = bits.Mul64(a3, x2) + var x13 uint64 + var x14 uint64 + x14, x13 = bits.Mul64(a3, x4) + var x15 uint64 + var x16 uint64 + x16, x15 = bits.Mul64(a2, x2) + var x17 uint64 + var x18 uint64 + x18, x17 = bits.Mul64(a2, x5) + var x19 uint64 + var x20 uint64 + x20, x19 = bits.Mul64(a2, a2) + var x21 uint64 + var x22 uint64 + x22, x21 = bits.Mul64(a1, x2) + var x23 uint64 + var x24 uint64 + x24, x23 = bits.Mul64(a1, x6) + var 
x25 uint64 + var x26 uint64 + x26, x25 = bits.Mul64(a1, x7) + var x27 uint64 + var x28 uint64 + x28, x27 = bits.Mul64(a1, a1) + var x29 uint64 + var x30 uint64 + x30, x29 = bits.Mul64(a0, x3) + var x31 uint64 + var x32 uint64 + x32, x31 = bits.Mul64(a0, x6) + var x33 uint64 + var x34 uint64 + x34, x33 = bits.Mul64(a0, x7) + var x35 uint64 + var x36 uint64 + x36, x35 = bits.Mul64(a0, x8) + var x37 uint64 + var x38 uint64 + x38, x37 = bits.Mul64(a0, a0) + var x39 uint64 + var x40 uint64 + x39, x40 = bits.Add64(x21, x17, uint64(0x0)) + var x41 uint64 + x41, _ = bits.Add64(x22, x18, uint64(uint1(x40))) + var x43 uint64 + var x44 uint64 + x43, x44 = bits.Add64(x37, x39, uint64(0x0)) + var x45 uint64 + x45, _ = bits.Add64(x38, x41, uint64(uint1(x44))) + x47 := ((x43 >> 51) | ((x45 << 13) & 0xffffffffffffffff)) + x48 := (x43 & 0x7ffffffffffff) + var x49 uint64 + var x50 uint64 + x49, x50 = bits.Add64(x23, x19, uint64(0x0)) + var x51 uint64 + x51, _ = bits.Add64(x24, x20, uint64(uint1(x50))) + var x53 uint64 + var x54 uint64 + x53, x54 = bits.Add64(x29, x49, uint64(0x0)) + var x55 uint64 + x55, _ = bits.Add64(x30, x51, uint64(uint1(x54))) + var x57 uint64 + var x58 uint64 + x57, x58 = bits.Add64(x25, x9, uint64(0x0)) + var x59 uint64 + x59, _ = bits.Add64(x26, x10, uint64(uint1(x58))) + var x61 uint64 + var x62 uint64 + x61, x62 = bits.Add64(x31, x57, uint64(0x0)) + var x63 uint64 + x63, _ = bits.Add64(x32, x59, uint64(uint1(x62))) + var x65 uint64 + var x66 uint64 + x65, x66 = bits.Add64(x27, x11, uint64(0x0)) + var x67 uint64 + x67, _ = bits.Add64(x28, x12, uint64(uint1(x66))) + var x69 uint64 + var x70 uint64 + x69, x70 = bits.Add64(x33, x65, uint64(0x0)) + var x71 uint64 + x71, _ = bits.Add64(x34, x67, uint64(uint1(x70))) + var x73 uint64 + var x74 uint64 + x73, x74 = bits.Add64(x15, x13, uint64(0x0)) + var x75 uint64 + x75, _ = bits.Add64(x16, x14, uint64(uint1(x74))) + var x77 uint64 + var x78 uint64 + x77, x78 = bits.Add64(x35, x73, uint64(0x0)) + var x79 uint64 + x79, _ = bits.Add64(x36, x75, uint64(uint1(x78))) + var x81 uint64 + var x82 uint64 + x81, x82 = bits.Add64(x47, x77, uint64(0x0)) + x83 := (uint64(uint1(x82)) + x79) + x84 := ((x81 >> 51) | ((x83 << 13) & 0xffffffffffffffff)) + x85 := (x81 & 0x7ffffffffffff) + var x86 uint64 + var x87 uint64 + x86, x87 = bits.Add64(x84, x69, uint64(0x0)) + x88 := (uint64(uint1(x87)) + x71) + x89 := ((x86 >> 51) | ((x88 << 13) & 0xffffffffffffffff)) + x90 := (x86 & 0x7ffffffffffff) + var x91 uint64 + var x92 uint64 + x91, x92 = bits.Add64(x89, x61, uint64(0x0)) + x93 := (uint64(uint1(x92)) + x63) + x94 := ((x91 >> 51) | ((x93 << 13) & 0xffffffffffffffff)) + x95 := (x91 & 0x7ffffffffffff) + var x96 uint64 + var x97 uint64 + x96, x97 = bits.Add64(x94, x53, uint64(0x0)) + x98 := (uint64(uint1(x97)) + x55) + x99 := ((x96 >> 51) | ((x98 << 13) & 0xffffffffffffffff)) + x100 := (x96 & 0x7ffffffffffff) + x101 := (x99 * 0x13) + x102 := (x48 + x101) + x103 := (x102 >> 51) + x104 := (x102 & 0x7ffffffffffff) + x105 := (x103 + x85) + x106 := uint1((x105 >> 51)) + x107 := (x105 & 0x7ffffffffffff) + x108 := (uint64(x106) + x90) + a0 = x104 + a1 = x107 + a2 = x108 + a3 = x95 + a4 = x100 + + arg2-- + if arg2 == 0 { + break + } + } + + out1[0], out1[1], out1[2], out1[3], out1[4] = a0, a1, a2, a3, a4 +} + +func cmovznzU64(arg1 uint1, arg2 uint64, arg3 uint64) uint64 { + x1 := (uint64(arg1) * 0xffffffffffffffff) + return ((x1 & arg3) | ((^x1) & arg2)) +} + +func addcarryxU51(arg1 uint1, arg2 uint64, arg3 uint64) (out1 uint64, out2 uint1) { + x1 := ((uint64(arg1) + 
arg2) + arg3) + x2 := (x1 & 0x7ffffffffffff) + x3 := uint1((x1 >> 51)) + return x2, x3 +} + +func subborrowxU51(arg1 uint1, arg2 uint64, arg3 uint64) (out1 uint64, out2 uint1) { + x1 := ((int64(arg2) - int64(arg1)) - int64(arg3)) + x2 := int1((x1 >> 51)) + x3 := (uint64(x1) & 0x7ffffffffffff) + return x3, (0x0 - uint1(x2)) +} + +func toBytesInlined(out1 *[32]uint8, arg1 *TightFieldElement) { + x1, x2 := subborrowxU51(0x0, arg1[0], 0x7ffffffffffed) + x3, x4 := subborrowxU51(x2, arg1[1], 0x7ffffffffffff) + x5, x6 := subborrowxU51(x4, arg1[2], 0x7ffffffffffff) + x7, x8 := subborrowxU51(x6, arg1[3], 0x7ffffffffffff) + x9, x10 := subborrowxU51(x8, arg1[4], 0x7ffffffffffff) + x11 := cmovznzU64(x10, uint64(0x0), 0xffffffffffffffff) + x12, x13 := addcarryxU51(0x0, x1, (x11 & 0x7ffffffffffed)) + x14, x15 := addcarryxU51(x13, x3, (x11 & 0x7ffffffffffff)) + x16, x17 := addcarryxU51(x15, x5, (x11 & 0x7ffffffffffff)) + x18, x19 := addcarryxU51(x17, x7, (x11 & 0x7ffffffffffff)) + x20, _ := addcarryxU51(x19, x9, (x11 & 0x7ffffffffffff)) // x21 unused + x22 := (x20 << 4) + x23 := (x18 * uint64(0x2)) + x24 := (x16 << 6) + x25 := (x14 << 3) + x26 := (uint8(x12) & 0xff) + x27 := (x12 >> 8) + x28 := (uint8(x27) & 0xff) + x29 := (x27 >> 8) + x30 := (uint8(x29) & 0xff) + x31 := (x29 >> 8) + x32 := (uint8(x31) & 0xff) + x33 := (x31 >> 8) + x34 := (uint8(x33) & 0xff) + x35 := (x33 >> 8) + x36 := (uint8(x35) & 0xff) + x37 := uint8((x35 >> 8)) + x38 := (x25 + uint64(x37)) + x39 := (uint8(x38) & 0xff) + x40 := (x38 >> 8) + x41 := (uint8(x40) & 0xff) + x42 := (x40 >> 8) + x43 := (uint8(x42) & 0xff) + x44 := (x42 >> 8) + x45 := (uint8(x44) & 0xff) + x46 := (x44 >> 8) + x47 := (uint8(x46) & 0xff) + x48 := (x46 >> 8) + x49 := (uint8(x48) & 0xff) + x50 := uint8((x48 >> 8)) + x51 := (x24 + uint64(x50)) + x52 := (uint8(x51) & 0xff) + x53 := (x51 >> 8) + x54 := (uint8(x53) & 0xff) + x55 := (x53 >> 8) + x56 := (uint8(x55) & 0xff) + x57 := (x55 >> 8) + x58 := (uint8(x57) & 0xff) + x59 := (x57 >> 8) + x60 := (uint8(x59) & 0xff) + x61 := (x59 >> 8) + x62 := (uint8(x61) & 0xff) + x63 := (x61 >> 8) + x64 := (uint8(x63) & 0xff) + x65 := uint1((x63 >> 8)) + x66 := (x23 + uint64(x65)) + x67 := (uint8(x66) & 0xff) + x68 := (x66 >> 8) + x69 := (uint8(x68) & 0xff) + x70 := (x68 >> 8) + x71 := (uint8(x70) & 0xff) + x72 := (x70 >> 8) + x73 := (uint8(x72) & 0xff) + x74 := (x72 >> 8) + x75 := (uint8(x74) & 0xff) + x76 := (x74 >> 8) + x77 := (uint8(x76) & 0xff) + x78 := uint8((x76 >> 8)) + x79 := (x22 + uint64(x78)) + x80 := (uint8(x79) & 0xff) + x81 := (x79 >> 8) + x82 := (uint8(x81) & 0xff) + x83 := (x81 >> 8) + x84 := (uint8(x83) & 0xff) + x85 := (x83 >> 8) + x86 := (uint8(x85) & 0xff) + x87 := (x85 >> 8) + x88 := (uint8(x87) & 0xff) + x89 := (x87 >> 8) + x90 := (uint8(x89) & 0xff) + x91 := uint8((x89 >> 8)) + out1[0] = x26 + out1[1] = x28 + out1[2] = x30 + out1[3] = x32 + out1[4] = x34 + out1[5] = x36 + out1[6] = x39 + out1[7] = x41 + out1[8] = x43 + out1[9] = x45 + out1[10] = x47 + out1[11] = x49 + out1[12] = x52 + out1[13] = x54 + out1[14] = x56 + out1[15] = x58 + out1[16] = x60 + out1[17] = x62 + out1[18] = x64 + out1[19] = x67 + out1[20] = x69 + out1[21] = x71 + out1[22] = x73 + out1[23] = x75 + out1[24] = x77 + out1[25] = x80 + out1[26] = x82 + out1[27] = x84 + out1[28] = x86 + out1[29] = x88 + out1[30] = x90 + out1[31] = x91 +} diff --git a/internal/tony/tony_u32.go b/internal/tony/tony_u32.go new file mode 100644 index 0000000..1fc8a5c --- /dev/null +++ b/internal/tony/tony_u32.go @@ -0,0 +1,141 @@ +// Copyright (c) 2021 Oasis 
Labs Inc. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +//go:build (386 || arm || mips || mipsle || mips64le || mips64 || force32bit) && !force64bit +// +build 386 arm mips mipsle mips64le mips64 force32bit +// +build !force64bit + +package tony + +import fiat "github.com/mit-plv/fiat-crypto/fiat-go/32/curve25519" + +type ( + TightFieldElement fiat.TightFieldElement + LooseFieldElement fiat.LooseFieldElement +) + +func (tfe *TightFieldElement) CarryMul(arg1, arg2 *LooseFieldElement) *TightFieldElement { + fiat.CarryMul((*fiat.TightFieldElement)(tfe), (*fiat.LooseFieldElement)(arg1), (*fiat.LooseFieldElement)(arg2)) + return tfe +} + +func (tfe *TightFieldElement) CarryMulAdd(arg1 *LooseFieldElement, arg2, arg3 *TightFieldElement) *TightFieldElement { + var sum LooseFieldElement + sum.Add(arg2, arg3) + return tfe.CarryMul(arg1, &sum) +} + +func (tfe *TightFieldElement) CarryMulSub(arg1 *LooseFieldElement, arg2, arg3 *TightFieldElement) *TightFieldElement { + var diff LooseFieldElement + diff.Sub(arg2, arg3) + return tfe.CarryMul(arg1, &diff) +} + +func (tfe *TightFieldElement) CarrySquare(arg1 *LooseFieldElement) *TightFieldElement { + fiat.CarrySquare((*fiat.TightFieldElement)(tfe), (*fiat.LooseFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) CarrySquareAdd(arg1, arg2 *TightFieldElement) *TightFieldElement { + lfe := tfe.RelaxCast() + lfe.Add(arg1, arg2) + return tfe.CarrySquare(lfe) +} + +func (tfe *TightFieldElement) CarrySquareSub(arg1, arg2 *TightFieldElement) *TightFieldElement { + lfe := tfe.RelaxCast() + lfe.Sub(arg1, arg2) + return tfe.CarrySquare(lfe) +} + +func (tfe *TightFieldElement) CarryPow2k(arg1 *LooseFieldElement, k uint) *TightFieldElement { + if k == 0 { + panic("internal/tony: k out of bounds") + } + + tfe.CarrySquare(arg1) + for i := uint(1); i < k; i++ { + tfe.CarrySquare(tfe.RelaxCast()) + } + + return tfe +} + +func (tfe *TightFieldElement) CarryScmul121666(arg1 *LooseFieldElement) *TightFieldElement { + fiat.CarryScmul121666((*fiat.TightFieldElement)(tfe), 
(*fiat.LooseFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) Carry(arg1 *LooseFieldElement) *TightFieldElement { + fiat.Carry((*fiat.TightFieldElement)(tfe), (*fiat.LooseFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) CarryAdd(arg1, arg2 *TightFieldElement) *TightFieldElement { + fiat.CarryAdd((*fiat.TightFieldElement)(tfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return tfe +} + +func (tfe *TightFieldElement) CarrySub(arg1, arg2 *TightFieldElement) *TightFieldElement { + fiat.CarrySub((*fiat.TightFieldElement)(tfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return tfe +} + +func (tfe *TightFieldElement) CarryOpp(arg1 *TightFieldElement) *TightFieldElement { + fiat.CarryOpp((*fiat.TightFieldElement)(tfe), (*fiat.TightFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) ToBytes(out1 *[32]uint8) { + fiat.ToBytes(out1, (*fiat.TightFieldElement)(tfe)) +} + +func (tfe *TightFieldElement) FromBytes(arg1 *[32]uint8) { + fiat.FromBytes((*fiat.TightFieldElement)(tfe), arg1) +} + +func (lfe *LooseFieldElement) Add(arg1, arg2 *TightFieldElement) *LooseFieldElement { + fiat.Add((*fiat.LooseFieldElement)(lfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return lfe +} + +func (lfe *LooseFieldElement) Sub(arg1, arg2 *TightFieldElement) *LooseFieldElement { + fiat.Sub((*fiat.LooseFieldElement)(lfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return lfe +} + +func (lfe *LooseFieldElement) Opp(arg1 *TightFieldElement) *LooseFieldElement { + fiat.Opp((*fiat.LooseFieldElement)(lfe), (*fiat.TightFieldElement)(arg1)) + return lfe +} + +// Uint8ToLimb converts from a uint8 to a limb. +func Uint8ToLimb(i uint8) uint32 { + return (uint32)(i) +} diff --git a/internal/tony/tony_u64.go b/internal/tony/tony_u64.go new file mode 100644 index 0000000..33b2f86 --- /dev/null +++ b/internal/tony/tony_u64.go @@ -0,0 +1,135 @@ +// Copyright (c) 2021 Oasis Labs Inc. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +//go:build (amd64 || arm64 || ppc64le || ppc64 || s390x || force64bit) && !force32bit +// +build amd64 arm64 ppc64le ppc64 s390x force64bit +// +build !force32bit + +package tony + +import fiat "github.com/mit-plv/fiat-crypto/fiat-go/64/curve25519" + +type ( + TightFieldElement fiat.TightFieldElement + LooseFieldElement fiat.LooseFieldElement +) + +func (tfe *TightFieldElement) CarryMul(arg1, arg2 *LooseFieldElement) *TightFieldElement { + fiat.CarryMul((*fiat.TightFieldElement)(tfe), (*fiat.LooseFieldElement)(arg1), (*fiat.LooseFieldElement)(arg2)) + return tfe +} + +func (tfe *TightFieldElement) CarryMulAdd(arg1 *LooseFieldElement, arg2, arg3 *TightFieldElement) *TightFieldElement { + carryMulAddInlined(tfe, arg1, arg2, arg3) + return tfe +} + +func (tfe *TightFieldElement) CarryMulSub(arg1 *LooseFieldElement, arg2, arg3 *TightFieldElement) *TightFieldElement { + carryMulSubInlined(tfe, arg1, arg2, arg3) + return tfe +} + +func (tfe *TightFieldElement) CarrySquare(arg1 *LooseFieldElement) *TightFieldElement { + fiat.CarrySquare((*fiat.TightFieldElement)(tfe), (*fiat.LooseFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) CarrySquareAdd(arg1, arg2 *TightFieldElement) *TightFieldElement { + lfe := tfe.RelaxCast() + lfe.Add(arg1, arg2) + return tfe.CarrySquare(lfe) +} + +func (tfe *TightFieldElement) CarrySquareSub(arg1, arg2 *TightFieldElement) *TightFieldElement { + lfe := tfe.RelaxCast() + lfe.Sub(arg1, arg2) + return tfe.CarrySquare(lfe) +} + +func (tfe *TightFieldElement) CarryPow2k(arg1 *LooseFieldElement, k uint) *TightFieldElement { + if k == 0 { + panic("internal/tony: k out of bounds") + } + + carryPow2kInlined(tfe, arg1, k) + return tfe +} + +func (tfe *TightFieldElement) CarryScmul121666(arg1 *LooseFieldElement) *TightFieldElement { + fiat.CarryScmul121666((*fiat.TightFieldElement)(tfe), (*fiat.LooseFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) Carry(arg1 *LooseFieldElement) *TightFieldElement { + fiat.Carry((*fiat.TightFieldElement)(tfe), (*fiat.LooseFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) CarryAdd(arg1, arg2 *TightFieldElement) *TightFieldElement { + fiat.CarryAdd((*fiat.TightFieldElement)(tfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return tfe +} + +func (tfe *TightFieldElement) CarrySub(arg1, arg2 *TightFieldElement) *TightFieldElement { + fiat.CarrySub((*fiat.TightFieldElement)(tfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return tfe +} + +func (tfe *TightFieldElement) CarryOpp(arg1 *TightFieldElement) *TightFieldElement { + fiat.CarryOpp((*fiat.TightFieldElement)(tfe), (*fiat.TightFieldElement)(arg1)) + return tfe +} + +func (tfe *TightFieldElement) ToBytes(out1 *[32]uint8) { + toBytesInlined(out1, tfe) +} + +func (tfe *TightFieldElement) FromBytes(arg1 *[32]uint8) { + fiat.FromBytes((*fiat.TightFieldElement)(tfe), arg1) +} + +func (lfe *LooseFieldElement) Add(arg1, arg2 *TightFieldElement) *LooseFieldElement 
{ + fiat.Add((*fiat.LooseFieldElement)(lfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return lfe +} + +func (lfe *LooseFieldElement) Sub(arg1, arg2 *TightFieldElement) *LooseFieldElement { + fiat.Sub((*fiat.LooseFieldElement)(lfe), (*fiat.TightFieldElement)(arg1), (*fiat.TightFieldElement)(arg2)) + return lfe +} + +func (lfe *LooseFieldElement) Opp(arg1 *TightFieldElement) *LooseFieldElement { + fiat.Opp((*fiat.LooseFieldElement)(lfe), (*fiat.TightFieldElement)(arg1)) + return lfe +} + +// Uint8ToLimb converts from a uint8 to a limb. +func Uint8ToLimb(i uint8) uint64 { + return (uint64)(i) +} From ee44f25eac1ac8d9c552dc090cdc1f9c9a337418 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 9 Aug 2021 07:54:07 +0000 Subject: [PATCH 2/3] internal/field: Use fiat-crypto --- README.md | 6 +- curve/constants_u32.go | 3 - curve/constants_u64.go | 3 - curve/edwards_vector_amd64.go | 11 + curve/models.go | 29 +- curve/montgomery.go | 54 ++- internal/asm/amd64/common.go | 2 - internal/asm/amd64/field_u64.go | 468 -------------------- internal/asm/amd64/gen.sh | 1 - internal/field/field.go | 160 ++++++- internal/field/field_u32.go | 441 +------------------ internal/field/field_u64.go | 578 +------------------------ internal/field/field_u64_amd64.go | 39 -- internal/field/field_u64_amd64.s | 401 ----------------- internal/field/field_u64_amd64_test.go | 191 -------- internal/field/field_u64_generic.go | 42 -- 16 files changed, 216 insertions(+), 2213 deletions(-) delete mode 100644 internal/asm/amd64/field_u64.go delete mode 100644 internal/field/field_u64_amd64.go delete mode 100644 internal/field/field_u64_amd64.s delete mode 100644 internal/field/field_u64_amd64_test.go delete mode 100644 internal/field/field_u64_generic.go diff --git a/README.md b/README.md index 55925ad..fa8a783 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,8 @@ fault of the curve25519-voi developers alone. * The majority of curve25519-voi is derived from curve25519-dalek. + * fiat-crypto is used for the field arithmetic. + * The Ed25519 batch verification started off as a port of the implementation present in ed25519-dalek, but was later switched to be based off ed25519consensus. @@ -110,10 +112,6 @@ fault of the curve25519-voi developers alone. additional inspiration taken from Thomas Pornin's paper and curve9767 implementation. - * The assembly optimized field element multiplications were taken - (with minor modifications) from George Tankersley's ristretto255 - package. - * The Elligator 2 mapping was taken from Loup Vaillant's Monocypher package. diff --git a/curve/constants_u32.go b/curve/constants_u32.go index 6985f78..a07afab 100644 --- a/curve/constants_u32.go +++ b/curve/constants_u32.go @@ -190,9 +190,6 @@ var constINVSQRT_A_MINUS_D = field.NewElement2625( 6111466, 4156064, 39310137, 12243467, 41204824, 120896, 20826367, 26493656, 6093567, 31568420, ) -// `APLUS2_OVER_FOUR` is (A+2)/4. (This is used internally within the Montgomery ladder.) -var constAPLUS2_OVER_FOUR = field.NewElement2625(121666, 0, 0, 0, 0, 0, 0, 0, 0, 0) - // `[2^128]B` var constB_SHL_128 = newEdwardsPoint( field.NewElement2625( diff --git a/curve/constants_u64.go b/curve/constants_u64.go index f30ad3a..2c2e73d 100644 --- a/curve/constants_u64.go +++ b/curve/constants_u64.go @@ -272,9 +272,6 @@ var constINVSQRT_A_MINUS_D = field.NewElement51( 2118520810568447, ) -// `APLUS2_OVER_FOUR` is (A+2)/4. (This is used internally within the Montgomery ladder.) 
-var constAPLUS2_OVER_FOUR = field.NewElement51(121666, 0, 0, 0, 0) - // `[2^128]B` var constB_SHL_128 = newEdwardsPoint( field.NewElement51( diff --git a/curve/edwards_vector_amd64.go b/curve/edwards_vector_amd64.go index 3505950..f78bacb 100644 --- a/curve/edwards_vector_amd64.go +++ b/curve/edwards_vector_amd64.go @@ -116,6 +116,17 @@ type extendedPoint struct { func (p *EdwardsPoint) setExtended(ep *extendedPoint) *EdwardsPoint { ep.inner.Split(&p.inner.X, &p.inner.Y, &p.inner.Z, &p.inner.T) + + // Ensure that X, Y, Z, T are fully reduced. + // + // In practice this is probably not required, but there is a + // difference between extendedPoint (b < 0.007) and TightFieldElement + // ([0x0 ~> 0x8000000000000])'s bounds. + p.inner.X.StrictReduce() + p.inner.Y.StrictReduce() + p.inner.Z.StrictReduce() + p.inner.T.StrictReduce() + return p } diff --git a/curve/models.go b/curve/models.go index 3380daa..5e785a9 100644 --- a/curve/models.go +++ b/curve/models.go @@ -179,8 +179,7 @@ func (p *completedPoint) Double(pp *projectivePoint) *completedPoint { XX.Square(&pp.X) YY.Square(&pp.Y) ZZ2.Square2(&pp.Z) - X_plus_Y_sq.Add(&pp.X, &pp.Y) // X+Y - X_plus_Y_sq.Square(&X_plus_Y_sq) // (X+Y)^2 + X_plus_Y_sq.SquareAdd(&pp.X, &pp.Y) p.Y.Add(&YY, &XX) p.X.Sub(&X_plus_Y_sq, &p.Y) @@ -192,10 +191,8 @@ func (p *completedPoint) Double(pp *projectivePoint) *completedPoint { func (p *completedPoint) AddEdwardsProjectiveNiels(a *EdwardsPoint, b *projectiveNielsPoint) *completedPoint { var PP, MM, TT2d, ZZ, ZZ2 field.Element - PP.Add(&a.inner.Y, &a.inner.X) // a.Y + a.X - PP.Mul(&PP, &b.Y_plus_X) // (a.Y + a.X) * b.Y_plus_X - MM.Sub(&a.inner.Y, &a.inner.X) // a.Y - a.X - MM.Mul(&MM, &b.Y_minus_X) // (a.Y - a.X) * b.Y_minus_X + PP.MulAdd(&b.Y_plus_X, &a.inner.Y, &a.inner.X) + MM.MulSub(&b.Y_minus_X, &a.inner.Y, &a.inner.X) TT2d.Mul(&a.inner.T, &b.T2d) ZZ.Mul(&a.inner.Z, &b.Z) ZZ2.Add(&ZZ, &ZZ) @@ -215,10 +212,8 @@ func (p *completedPoint) AddCompletedProjectiveNiels(a *completedPoint, b *proje func (p *completedPoint) SubEdwardsProjectiveNiels(a *EdwardsPoint, b *projectiveNielsPoint) *completedPoint { var PM, MP, TT2d, ZZ, ZZ2 field.Element - PM.Add(&a.inner.Y, &a.inner.X) // a.Y + a.X - PM.Mul(&PM, &b.Y_minus_X) // (a.Y + a.X) * b.Y_minus_X - MP.Sub(&a.inner.Y, &a.inner.X) // a.Y - a.X - MP.Mul(&MP, &b.Y_plus_X) // (a.Y - a.X) * b.Y_plus_X + PM.MulAdd(&b.Y_minus_X, &a.inner.Y, &a.inner.X) + MP.MulSub(&b.Y_plus_X, &a.inner.Y, &a.inner.X) TT2d.Mul(&a.inner.T, &b.T2d) ZZ.Mul(&a.inner.Z, &b.Z) ZZ2.Add(&ZZ, &ZZ) @@ -237,10 +232,8 @@ func (p *completedPoint) SubCompletedProjectiveNiels(a *completedPoint, b *proje func (p *completedPoint) AddEdwardsAffineNiels(a *EdwardsPoint, b *affineNielsPoint) *completedPoint { var PP, MM, Txy2d, Z2 field.Element - PP.Add(&a.inner.Y, &a.inner.X) // a.Y + a.X - PP.Mul(&PP, &b.y_plus_x) // (a.Y + a.X) * b.y_plus_x - MM.Sub(&a.inner.Y, &a.inner.X) // a.Y - a.X - MM.Mul(&MM, &b.y_minus_x) // (a.Y - a.X) * b.y_minus_x + PP.MulAdd(&b.y_plus_x, &a.inner.Y, &a.inner.X) + MM.MulSub(&b.y_minus_x, &a.inner.Y, &a.inner.X) Txy2d.Mul(&a.inner.T, &b.xy2d) Z2.Add(&a.inner.Z, &a.inner.Z) @@ -258,11 +251,9 @@ func (p *completedPoint) AddCompletedAffineNiels(a *completedPoint, b *affineNie } func (p *completedPoint) SubEdwardsAffineNiels(a *EdwardsPoint, b *affineNielsPoint) *completedPoint { - var Y_plus_X, Y_minus_X, PM, MP, Txy2d, Z2 field.Element - Y_plus_X.Add(&a.inner.Y, &a.inner.X) - Y_minus_X.Sub(&a.inner.Y, &a.inner.X) - PM.Mul(&Y_plus_X, &b.y_minus_x) - MP.Mul(&Y_minus_X, &b.y_plus_x) + 
var PM, MP, Txy2d, Z2 field.Element + PM.MulAdd(&b.y_minus_x, &a.inner.Y, &a.inner.X) + MP.MulSub(&b.y_plus_x, &a.inner.Y, &a.inner.X) Txy2d.Mul(&a.inner.T, &b.xy2d) Z2.Add(&a.inner.Z, &a.inner.Z) diff --git a/curve/montgomery.go b/curve/montgomery.go index 0008642..97a2da6 100644 --- a/curve/montgomery.go +++ b/curve/montgomery.go @@ -36,6 +36,7 @@ import ( "github.com/oasisprotocol/curve25519-voi/curve/scalar" "github.com/oasisprotocol/curve25519-voi/internal/field" + "github.com/oasisprotocol/curve25519-voi/internal/tony" ) var errUCoordinateOnTwist = fmt.Errorf("curve/montgomery: Montgomery u-coordinate is on twist") @@ -127,40 +128,45 @@ func NewMontgomeryPoint() *MontgomeryPoint { } func montgomeryDifferentialAddAndDouble(P, Q *montgomeryProjectivePoint, affine_PmQ *field.Element) { - var t0, t1, t2, t3 field.Element - t0.Add(&P.U, &P.W) - t1.Sub(&P.U, &P.W) - t2.Add(&Q.U, &Q.W) - t3.Sub(&Q.U, &Q.W) + // Using the fiat-crypto routines directly can shave off reductions, + // so we do that. - var t4, t5 field.Element - t4.Square(&t0) // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2 - t5.Square(&t1) // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2 + p_U, p_W := P.U.UnsafeInner(), P.W.UnsafeInner() + q_U, q_W := Q.U.UnsafeInner(), Q.W.UnsafeInner() - var t6 field.Element - t6.Sub(&t4, &t5) // 4 U_P W_P + var t0, t1, t2, t3 tony.LooseFieldElement + t0.Add(p_U, p_W) + t1.Sub(p_U, p_W) + t2.Add(q_U, q_W) + t3.Sub(q_U, q_W) - var t7, t8 field.Element - t7.Mul(&t0, &t3) // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q - t8.Mul(&t1, &t2) // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q + var t4, t5 tony.TightFieldElement + t4.CarrySquare(&t0) // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2 + t5.CarrySquare(&t1) // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2 - // Note: dalek uses even more temporary variables, but eliminating them - // is slightly faster since the Go compiler won't do that for us. 
+ var t6 tony.LooseFieldElement + t6.Sub(&t4, &t5) // 4 U_P W_P - Q.U.Add(&t7, &t8) // 2 (U_P U_Q - W_P W_Q): t9 - Q.W.Sub(&t7, &t8) // 2 (W_P U_Q - U_P W_Q): t10 + var t7, t8 tony.TightFieldElement + t7.CarryMul(&t0, &t3) // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q + t8.CarryMul(&t1, &t2) // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q - Q.U.Square(&Q.U) // 4 (U_P U_Q - W_P W_Q)^2: t11 - Q.W.Square(&Q.W) // 4 (W_P U_Q - U_P W_Q)^2: t12 + // 2 (U_P U_Q - W_P W_Q): t9 + // 2 (W_P U_Q - U_P W_Q): t10 + // 4 (U_P U_Q - W_P W_Q)^2: t11 + // 4 (W_P U_Q - U_P W_Q)^2: t12 + q_U.CarrySquareAdd(&t7, &t8) + q_W.CarrySquareSub(&t7, &t8) - P.W.Mul(&constAPLUS2_OVER_FOUR, &t6) // (A + 2) U_P U_Q: t13 + p_W.CarryScmul121666(&t6) // (A + 2) U_P U_Q: t13 - P.U.Mul(&t4, &t5) // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2: t14 - P.W.Add(&P.W, &t5) // (U_P - W_P)^2 + (A + 2) U_P W_P: t15 + p_U.CarryMul(t4.RelaxCast(), t5.RelaxCast()) // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2: t14 - P.W.Mul(&t6, &P.W) // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P): t16 + // (U_P - W_P)^2 + (A + 2) U_P W_P: t15 + // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P): t16 + p_W.CarryMulAdd(&t6, p_W, &t5) - Q.W.Mul(affine_PmQ, &Q.W) // U_D * 4 (W_P U_Q - U_P W_Q)^2: t17 + q_W.CarryMul(affine_PmQ.UnsafeInner().RelaxCast(), q_W.RelaxCast()) // U_D * 4 (W_P U_Q - U_P W_Q)^2: t17 // t18 := t11 // W_D * 4 (U_P U_Q - W_P W_Q)^2: t18 // P.U = t14 // U_{P'} = (U_P + W_P)^2 (U_P - W_P)^2 diff --git a/internal/asm/amd64/common.go b/internal/asm/amd64/common.go index 2403ff2..6ee3aaf 100644 --- a/internal/asm/amd64/common.go +++ b/internal/asm/amd64/common.go @@ -44,8 +44,6 @@ type ( affineNielsPointLookupTable struct{} affineNielsPoint struct{} - Element struct{} - cachedPointLookupTable struct{} cachedPoint struct{} extendedPoint struct{} diff --git a/internal/asm/amd64/field_u64.go b/internal/asm/amd64/field_u64.go deleted file mode 100644 index f9742c6..0000000 --- a/internal/asm/amd64/field_u64.go +++ /dev/null @@ -1,468 +0,0 @@ -// Copyright (c) 2017 George Tankersley. All rights reserved. -// Copyright (c) 2020-2021 Oasis Labs Inc. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// +build ignore - -package main - -import ( - "fmt" - "os" - - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . "github.com/mmcloughlin/avo/reg" -) - -func main() { - for i, step := range []func() error{ - SetCommon, - FeMul, - FePow2k, - } { - if err := step(); err != nil { - fmt.Printf("step %d failed: %v", i, err) - os.Exit(1) - } - } - - Generate() -} - -// These routines are based on the ristretto255 package by -// George Tankersley. The changes made are as follows: -// -// * Abstract out the reduction and carry into common code. -// * Instead of feSquare, implement fePow2k. -// * Use avo, because all the cool kids are doing it. - -func reduce64(r00, r01, r10, r11, r20, r21, r30, r31, r40, r41 GPVirtual) { - Comment("Reduce") - MOVQ(U64((1<<51)-1), RAX) // rax <-- mask51 - mask51 := RAX - - SHLQ(Imm(13), r00, r01) // r01 = shld with r00 - ANDQ(mask51, r00) // r00 &= mask51 - SHLQ(Imm(13), r10, r11) // r11 = shld with r10 - ANDQ(mask51, r10) // r10 &= mask51 - ADDQ(r01, r10) // r10 += r01 - SHLQ(Imm(13), r20, r21) // r21 = shld with r20 - ANDQ(mask51, r20) // r20 &= mask51 - ADDQ(r11, r20) // r20 += r11 - SHLQ(Imm(13), r30, r31) // r31 = shld with r30 - ANDQ(mask51, r30) // r30 &= mask51 - ADDQ(r21, r30) // r30 += r21 - SHLQ(Imm(13), r40, r41) // r41 = shld with r40 - ANDQ(mask51, r40) // r40 &= mask51 - ADDQ(r31, r40) // r40 += r31 - IMUL3Q(Imm(19), r41, r41) // r41 *= 19 - ADDQ(r41, r00) // r00 += r41 - - t0, t1, t2, t3, t4 := r01, r11, r21, r31, r41 - MOVQ(r00, t0) // t0 <-- r00 - MOVQ(r10, t1) // t1 <-- r10 - MOVQ(r20, t2) // t2 <-- r20 - MOVQ(r30, t3) // t3 <-- r30 - MOVQ(r40, t4) // t4 <-- r40 - ANDQ(mask51, r00) // r00 &= mask51 - ANDQ(mask51, r10) // r10 &= mask51 - ANDQ(mask51, r20) // r20 &= mask51 - ANDQ(mask51, r30) // r30 &= mask51 - ANDQ(mask51, r40) // r40 &= mask51 - SHRQ(Imm(51), t0) // t0 <- r00 >> 51 - SHRQ(Imm(51), t1) // t1 <- r10 >> 51 - SHRQ(Imm(51), t2) // t2 <- r20 >> 51 - SHRQ(Imm(51), t3) // t3 <- r30 >> 51 - SHRQ(Imm(51), t4) // t4 <- r40 >> 51 - IMUL3Q(Imm(19), t4, t4) // t4 <-- (r40 >> 51) * 19 - ADDQ(t0, r10) // r10 += t0 - ADDQ(t1, r20) // r20 += t1 - ADDQ(t2, r30) // r30 += t2 - ADDQ(t3, r40) // r40 += t3 - ADDQ(t4, r00) // r10 += t4 -} - -func FeMul() error { - TEXT( - "feMul", - NOSPLIT|NOFRAME, - "func(out, a, b *Element)", - ) - - Comment( - "Based on assembly generated by PeachPy. 
Equivalent to the Go in", - "feMulGeneric, which was originally based on the amd64-51-30k", - "assembly in SUPERCOP.", - ) - - a := Mem{Base: Load(Param("a"), GP64())} - b := Mem{Base: Load(Param("b"), GP64())} - - x0, x1, x2, x3, x4 := a.Offset(0), a.Offset(8), a.Offset(16), a.Offset(24), a.Offset(32) - y0, y1, y2, y3, y4 := b.Offset(0), b.Offset(8), b.Offset(16), b.Offset(24), b.Offset(32) - - Comment("r0 = x0*y0 + x1_19*y4 + x2_19*y3 + x3_19*y2 + x4_19*y1\n") - r00, r01 := GP64(), GP64() - - Comment("r00, r01 = x0*y0") - MOVQ(x0, RAX) - MULQ(y0) - MOVQ(RAX, r00) - MOVQ(RDX, r01) - - Comment("r00, r01 += x1_19*y4") - MOVQ(x1, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y4) - ADDQ(RAX, r00) - ADCQ(RDX, r01) - - Comment("r00, r01 += x2_19*y3") - MOVQ(x2, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y3) - ADDQ(RAX, r00) - ADCQ(RDX, r01) - - Comment("r00, r01 += x3_19*y2") - MOVQ(x3, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y2) - ADDQ(RAX, r00) - ADCQ(RDX, r01) - - Comment("r00, r01 += x4_19*y1") - MOVQ(x4, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y1) - ADDQ(RAX, r00) - ADCQ(RDX, r01) - - Comment("r1 = x0*y1 + x1*y0 + x2_19*y4 + x3_19*y3 + x4_19*y2\n") - r10, r11 := GP64(), GP64() - - Comment("r10, r11 = x0*y1") - MOVQ(x0, RAX) - MULQ(y1) - MOVQ(RAX, r10) - MOVQ(RDX, r11) - - Comment("r10, r11 += x1*y0") - MOVQ(x1, RAX) - MULQ(y0) - ADDQ(RAX, r10) - ADCQ(RDX, r11) - - Comment("r10, r11 += x2_19*y4") - MOVQ(x2, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y4) - ADDQ(RAX, r10) - ADCQ(RDX, r11) - - Comment("r10, r11 += x3_19*y3") - MOVQ(x3, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y3) - ADDQ(RAX, r10) - ADCQ(RDX, r11) - - Comment("r10, r11 += x4_19*y2") - MOVQ(x4, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y2) - ADDQ(RAX, r10) - ADCQ(RDX, r11) - - Comment("r2 = x0*y2 + x1*y1 + x2*y0 + x3_19*y4 + x4_19*y3\n") - r20, r21 := GP64(), GP64() - - Comment("r20, r11 = x0*y2") - MOVQ(x0, RAX) - MULQ(y2) - MOVQ(RAX, r20) - MOVQ(RDX, r21) - - Comment("r20, r21 += x1*y1") - MOVQ(x1, RAX) - MULQ(y1) - ADDQ(RAX, r20) - ADCQ(RDX, r21) - - Comment("r20, r21 += x2*y0") - MOVQ(x2, RAX) - MULQ(y0) - ADDQ(RAX, r20) - ADCQ(RDX, r21) - - Comment("r20, r21 += x3_19*y4") - MOVQ(x3, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y4) - ADDQ(RAX, r20) - ADCQ(RDX, r21) - - Comment("r20, r21 += x4_19*y3") - MOVQ(x4, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y3) - ADDQ(RAX, r20) - ADCQ(RDX, r21) - - Comment("r3 = x0*y3 + x1*y2 + x2*y1 + x3*y0 + x4_19*y4\n") - r30, r31 := GP64(), GP64() - - Comment("r30, r31 = x0*y3") - MOVQ(x0, RAX) - MULQ(y3) - MOVQ(RAX, r30) - MOVQ(RDX, r31) - - Comment("r30, r31 += x1*y2") - MOVQ(x1, RAX) - MULQ(y2) - ADDQ(RAX, r30) - ADCQ(RDX, r31) - - Comment("r30, r31 += x2*y1") - MOVQ(x2, RAX) - MULQ(y1) - ADDQ(RAX, r30) - ADCQ(RDX, r31) - - Comment("r30, r31 += x3*y0") - MOVQ(x3, RAX) - MULQ(y0) - ADDQ(RAX, r30) - ADCQ(RDX, r31) - - Comment("r30, r31 += x4_19*y4") - MOVQ(x4, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(y4) - ADDQ(RAX, r30) - ADCQ(RDX, r31) - - Comment("r3 = x0*y4 + x1*y3 + x2*y2 + x3*y1 + x4*y0\n") - r40, r41 := GP64(), GP64() - - Comment("r40, r31 = x0*y4") - MOVQ(x0, RAX) - MULQ(y4) - MOVQ(RAX, r40) - MOVQ(RDX, r41) - - Comment("r40, r41 += x1*y3") - MOVQ(x1, RAX) - MULQ(y3) - ADDQ(RAX, r40) - ADCQ(RDX, r41) - - Comment("r40, r41 += x2*y2") - MOVQ(x2, RAX) - MULQ(y2) - ADDQ(RAX, r40) - ADCQ(RDX, r41) - - Comment("r40, r41 += x3*y1") - MOVQ(x3, RAX) - MULQ(y1) - ADDQ(RAX, r40) - ADCQ(RDX, r41) - - Comment("r40, r41 += x4*y0") - MOVQ(x4, RAX) - MULQ(y0) - ADDQ(RAX, r40) - ADCQ(RDX, r41) - - reduce64(r00, r01, r10, 
r11, r20, r21, r30, r31, r40, r41) - - Comment("Write out the results") - out := Mem{Base: Load(Param("out"), GP64())} - MOVQ(r00, out.Offset(0)) - MOVQ(r10, out.Offset(8)) - MOVQ(r20, out.Offset(16)) - MOVQ(r30, out.Offset(24)) - MOVQ(r40, out.Offset(32)) - - RET() - - return nil -} - -func FePow2k() error { - TEXT( - "fePow2k", - NOSPLIT|NOFRAME, - "func(out, a *Element, k uint64)", - ) - - a := Mem{Base: Load(Param("a"), GP64())} - k := Load(Param("k"), GP64()) - - x0, x1, x2, x3, x4 := a.Offset(0), a.Offset(8), a.Offset(16), a.Offset(24), a.Offset(32) - - Label("pow2k_loop") - - Comment("r0 = x0*x0 + x1*38*x4 + x2*38*x3\n") - r00, r01 := GP64(), GP64() - - Comment("r00, r01 = x0*x0") - MOVQ(x0, RAX) - MULQ(x0) - MOVQ(RAX, r00) - MOVQ(RDX, r01) - - Comment("r00, r01 += x1*38*x4") - MOVQ(x1, RDX) - IMUL3Q(Imm(38), RDX, RAX) - MULQ(x4) - ADDQ(RAX, r00) - ADCQ(RDX, r01) - - Comment("r00, r01 += x2*38*x3") - MOVQ(x2, RDX) - IMUL3Q(Imm(38), RDX, RAX) - MULQ(x3) - ADDQ(RAX, r00) - ADCQ(RDX, r01) - - Comment("r1 = x0*2*x1 + x2*38*x4 + x3*19*x3\n") - r10, r11 := GP64(), GP64() - - Comment("r10, r11 = x0*2*x1") - MOVQ(x0, RAX) - SHLQ(Imm(1), RAX) - MULQ(x1) - MOVQ(RAX, r10) - MOVQ(RDX, r11) - - Comment("r10, r11 += x2*38*x4") - MOVQ(x2, RDX) - IMUL3Q(Imm(38), RDX, RAX) - MULQ(x4) - ADDQ(RAX, r10) - ADCQ(RDX, r11) - - Comment("r10, r11 += x3*19*x3") - MOVQ(x3, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(x3) - ADDQ(RAX, r10) - ADCQ(RDX, r11) - - Comment("r2 = x0*2*x2 + x1*x1 + x3*38*x4\n") - r20, r21 := GP64(), GP64() - - Comment("r20, r21 = x0*2*x2") - MOVQ(x0, RAX) - SHLQ(Imm(1), RAX) - MULQ(x2) - MOVQ(RAX, r20) - MOVQ(RDX, r21) - - Comment("r20, r21 += x1*x1") - MOVQ(x1, RAX) - MULQ(x1) - ADDQ(RAX, r20) - ADCQ(RDX, r21) - - Comment("r20, r21 += x3*38*x4") - MOVQ(x3, RDX) - IMUL3Q(Imm(38), RDX, RAX) - MULQ(x4) - ADDQ(RAX, r20) - ADCQ(RDX, r21) - - Comment("r3 = x0*2*x3 + x1*2*x2 + x4*19*x4\n") - r30, r31 := GP64(), GP64() - - Comment("r30, r31 = x0*2*x3") - MOVQ(x0, RAX) - SHLQ(Imm(1), RAX) - MULQ(x3) - MOVQ(RAX, r30) - MOVQ(RDX, r31) - - Comment("r30, r31 += x1*2*x2") - MOVQ(x1, RAX) - SHLQ(Imm(1), RAX) - MULQ(x2) - ADDQ(RAX, r30) - ADCQ(RDX, r31) - - Comment("r30, r31 += x4*19*x4") - MOVQ(x4, RDX) - IMUL3Q(Imm(19), RDX, RAX) - MULQ(x4) - ADDQ(RAX, r30) - ADCQ(RDX, r31) - - Comment("r4 = x0*2*x4 + x1*2*x3 + x2*x2\n") - r40, r41 := GP64(), GP64() - - Comment("r40, r41 = x0*2*x4") - MOVQ(x0, RAX) - SHLQ(Imm(1), RAX) - MULQ(x4) - MOVQ(RAX, r40) - MOVQ(RDX, r41) - - Comment("r40, r41 += x1*2*x3") - MOVQ(x1, RAX) - SHLQ(Imm(1), RAX) - MULQ(x3) - ADDQ(RAX, r40) - ADCQ(RDX, r41) - - Comment("r40, r41 += x2*x2") - MOVQ(x2, RAX) - MULQ(x2) - ADDQ(RAX, r40) - ADCQ(RDX, r41) - - reduce64(r00, r01, r10, r11, r20, r21, r30, r31, r40, r41) - - Comment("Write out the results") - Load(Param("out"), a.Base) // Rewrite a.Base to be `out`, so it gets loaded next iteration. 
- MOVQ(r00, a.Offset(0)) - MOVQ(r10, a.Offset(8)) - MOVQ(r20, a.Offset(16)) - MOVQ(r30, a.Offset(24)) - MOVQ(r40, a.Offset(32)) - - DECQ(k) - JNZ(LabelRef("pow2k_loop")) - - RET() - - return nil -} diff --git a/internal/asm/amd64/gen.sh b/internal/asm/amd64/gen.sh index 7d0babd..f59921f 100644 --- a/internal/asm/amd64/gen.sh +++ b/internal/asm/amd64/gen.sh @@ -1,4 +1,3 @@ #!/bin/sh -go run field_u64.go common.go > ../../field/field_u64_amd64.s go run window.go common.go > ../../../curve/window_amd64.s go run edwards_vector.go common.go > ../../../curve/edwards_vector_amd64.s diff --git a/internal/field/field.go b/internal/field/field.go index 5069693..68227df 100644 --- a/internal/field/field.go +++ b/internal/field/field.go @@ -32,7 +32,13 @@ // Package field implements field arithmetic modulo p = 2^255 - 19. package field -import "github.com/oasisprotocol/curve25519-voi/internal/subtle" +import ( + "fmt" + + "github.com/oasisprotocol/curve25519-voi/internal/disalloweq" + "github.com/oasisprotocol/curve25519-voi/internal/subtle" + "github.com/oasisprotocol/curve25519-voi/internal/tony" +) const ( // ElementSize is the size of a field element in bytes. @@ -63,8 +69,16 @@ var ( two.Add(&One, &One) return two }() + + constTwoTimesNineteen = &tony.LooseFieldElement{2 * 19} ) +// Element represents an element of the field Z/(2^255 - 19). +type Element struct { + disalloweq.DisallowEqual //nolint:unused + inner tony.TightFieldElement +} + // Set sets fe to t, and returns fe. func (fe *Element) Set(t *Element) *Element { *fe = *t @@ -112,6 +126,144 @@ func (fe *Element) IsZero() int { return subtle.ConstantTimeCompareBytes(selfBytes[:], zeroBytes[:]) } +// Add sets `fe = a + b`, and returns fe. +func (fe *Element) Add(a, b *Element) *Element { + fe.inner.CarryAdd(&a.inner, &b.inner) + return fe +} + +// Sub sets `fe = a - b`, and returns fe. +func (fe *Element) Sub(a, b *Element) *Element { + fe.inner.CarrySub(&a.inner, &b.inner) + return fe +} + +// Mul sets `fe = a * b`, and returns fe. +func (fe *Element) Mul(a, b *Element) *Element { + fe.inner.CarryMul(a.inner.RelaxCast(), b.inner.RelaxCast()) + return fe +} + +// Neg sets `fe = -t`, and returns fe. +func (fe *Element) Neg(t *Element) *Element { + fe.inner.CarryOpp(&t.inner) + return fe +} + +// MulAdd sets `fe = a * (b + c)`, and returns fe. +func (fe *Element) MulAdd(a, b, c *Element) *Element { + fe.inner.CarryMulAdd(a.inner.RelaxCast(), &b.inner, &c.inner) + return fe +} + +// MulSub sets `fe = a * (b - c)`, and returns fe. +func (fe *Element) MulSub(a, b, c *Element) *Element { + fe.inner.CarryMulSub(a.inner.RelaxCast(), &b.inner, &c.inner) + return fe +} + +// SquareAdd sets `fe = (a + b)^2`, and returns fe. +func (fe *Element) SquareAdd(a, b *Element) *Element { + fe.inner.CarrySquareAdd(&a.inner, &b.inner) + return fe +} + +// StrictReduce fully-reduces the field element, and returns fe. +func (fe *Element) StrictReduce() *Element { + fe.inner.Carry(fe.inner.RelaxCast()) + return fe +} + +// SetBytes loads a field element from the low 255 bits of a 256 bit input. +// +// WARNING: This function does not check that the input used the canonical +// representative. It masks the high bit, but it will happily decode +// 2^255 - 18 to 1. Applications that require a canonical encoding of +// every field element should decode, re-encode to the canonical encoding, +// and check that the input was canonical. 
+func (fe *Element) SetBytes(in []byte) (*Element, error) { + if len(in) != ElementSize { + return nil, fmt.Errorf("internal/field/fiat: unexpected input size") + } + + var t0 [32]byte + copy(t0[:], in) + t0[31] &= 127 + + fe.inner.FromBytes(&t0) + + return fe, nil +} + +// SetBytesWide loads a field element from a 512-bit little-endian input. +func (fe *Element) SetBytesWide(in []byte) (*Element, error) { + if len(in) != ElementWideSize { + return nil, fmt.Errorf("internal/field/fiat: unexpected input size") + } + + loMSB := uint8(in[31] >> 7) + hiMSB := uint8(in[63] >> 7) + + var t0, t1 [32]byte + copy(t0[:], in[:ElementSize]) + copy(t1[:], in[ElementSize:]) + t0[31] &= 127 + t1[31] &= 127 + + var lo, hi tony.TightFieldElement + lo.FromBytes(&t0) + hi.FromBytes(&t1) + + // Do this the hard way (aka "slow") way, so that we respect the + // bounds of LooseFieldElement. + var carry, loPlusCarry tony.TightFieldElement + carry[0] = tony.Uint8ToLimb(loMSB)*19 + tony.Uint8ToLimb(hiMSB)*2*19*19 // Trivially fits. + loPlusCarry.CarryAdd(&lo, &carry) + + hi.CarryMul(hi.RelaxCast(), constTwoTimesNineteen) + + fe.inner.CarryAdd(&loPlusCarry, &hi) + + return fe, nil +} + +// ToBytes packs the field element into 32 bytes. The encoding is canonical. +func (fe *Element) ToBytes(out []byte) error { + if len(out) != ElementSize { + return fmt.Errorf("internal/field/fiat: unexpected output size") + } + + var t0 [32]byte + fe.inner.ToBytes(&t0) + copy(out, t0[:]) + + return nil +} + +// Pow2k sets `fe = t^(2^k)`, given `k > 0`, and returns fe +func (fe *Element) Pow2k(t *Element, k uint) *Element { + if k == 0 { + panic("internal/field/fiat: k out of bounds") + } + + fe.inner.CarryPow2k(t.inner.RelaxCast(), k) + + return fe +} + +// Square sets `fe = t^2`, and returns fe. +func (fe *Element) Square(t *Element) *Element { + fe.inner.CarrySquare(t.inner.RelaxCast()) + return fe +} + +// Square2 sets `fe = 2*t^2`, and returns fe. +func (fe *Element) Square2(t *Element) *Element { + fe.Square(t) + fe.Add(fe, fe) + return fe +} + // Invert sets fe to the multiplicative inverse of t, and returns fe. // // The inverse is computed as self^(p-2), since x^(p-2)x = x^(p-1) = 1 (mod p). @@ -276,3 +428,9 @@ func BatchInvert(inputs []*Element) { acc = tmp } } + +// UnsafeInner exposes the inner actual field element to allow for things +// like vector implementations. +func (fe *Element) UnsafeInner() *tony.TightFieldElement { + return &fe.inner +} diff --git a/internal/field/field_u32.go b/internal/field/field_u32.go index 2e0c0e0..5723143 100644 --- a/internal/field/field_u32.go +++ b/internal/field/field_u32.go @@ -35,140 +35,7 @@ package field -import ( - "fmt" - - "github.com/oasisprotocol/curve25519-voi/internal/disalloweq" - "github.com/oasisprotocol/curve25519-voi/internal/subtle" -) - -func m(x, y uint32) uint64 { - // See the comment in curve/scalar/scalar_u32.go as to why this - // does not use `bits.Mul32`. - return uint64(x) * uint64(y) -} - -// Element represents an element of the field Z/(2^255 - 19). -type Element struct { - disalloweq.DisallowEqual //nolint:unused - inner [10]uint32 -} - -// Add sets `fe = a + b`, and returns fe. 
-func (fe *Element) Add(a, b *Element) *Element { - fe.inner[0] = a.inner[0] + b.inner[0] - fe.inner[1] = a.inner[1] + b.inner[1] - fe.inner[2] = a.inner[2] + b.inner[2] - fe.inner[3] = a.inner[3] + b.inner[3] - fe.inner[4] = a.inner[4] + b.inner[4] - fe.inner[5] = a.inner[5] + b.inner[5] - fe.inner[6] = a.inner[6] + b.inner[6] - fe.inner[7] = a.inner[7] + b.inner[7] - fe.inner[8] = a.inner[8] + b.inner[8] - fe.inner[9] = a.inner[9] + b.inner[9] - - return fe -} - -// Sub sets `fe = a - b`, and returns fe. -func (fe *Element) Sub(a, b *Element) *Element { - // Compute a - b as ((a + 2^4 * p) - b) to avoid underflow. - return fe.reduce(&[10]uint64{ - uint64((a.inner[0] + (0x3ffffed << 4)) - b.inner[0]), - uint64((a.inner[1] + (0x1ffffff << 4)) - b.inner[1]), - uint64((a.inner[2] + (0x3ffffff << 4)) - b.inner[2]), - uint64((a.inner[3] + (0x1ffffff << 4)) - b.inner[3]), - uint64((a.inner[4] + (0x3ffffff << 4)) - b.inner[4]), - uint64((a.inner[5] + (0x1ffffff << 4)) - b.inner[5]), - uint64((a.inner[6] + (0x3ffffff << 4)) - b.inner[6]), - uint64((a.inner[7] + (0x1ffffff << 4)) - b.inner[7]), - uint64((a.inner[8] + (0x3ffffff << 4)) - b.inner[8]), - uint64((a.inner[9] + (0x1ffffff << 4)) - b.inner[9]), - }) -} - -// Mul sets `fe = a * b`, and returns fe. -func (fe *Element) Mul(a, b *Element) *Element { - x, y := a.inner, b.inner - - // We assume that the input limbs x[i], y[i] are bounded by: - // - // x[i], y[i] < 2^(26 + b) if i even - // x[i], y[i] < 2^(25 + b) if i odd - // - // where b is a (real) parameter representing the excess bits of - // the limbs. We track the bitsizes of all variables through - // the computation and solve at the end for the allowable - // headroom bitsize b (which determines how many additions we - // can perform between reductions or multiplications). - - y1_19 := 19 * y[1] // This fits in a u32 - y2_19 := 19 * y[2] // iff 26 + b + lg(19) < 32 - y3_19 := 19 * y[3] // if b < 32 - 26 - 4.248 = 1.752 - y4_19 := 19 * y[4] - y5_19 := 19 * y[5] // below, b<2.5: this is a bottleneck, - y6_19 := 19 * y[6] // could be avoided by promoting to - y7_19 := 19 * y[7] // u64 here instead of in m() - y8_19 := 19 * y[8] - y9_19 := 19 * y[9] - - // What happens when we multiply x[i] with y[j] and place the - // result into the (i+j)-th limb? - // - // x[i] represents the value x[i]*2^ceil(i*51/2) - // y[j] represents the value y[j]*2^ceil(j*51/2) - // z[i+j] represents the value z[i+j]*2^ceil((i+j)*51/2) - // x[i]*y[j] represents the value x[i]*y[i]*2^(ceil(i*51/2)+ceil(j*51/2)) - // - // Since the radix is already accounted for, the result placed - // into the (i+j)-th limb should be - // - // x[i]*y[i]*2^(ceil(i*51/2)+ceil(j*51/2) - ceil((i+j)*51/2)). - // - // The value of ceil(i*51/2)+ceil(j*51/2) - ceil((i+j)*51/2) is - // 1 when both i and j are odd, and 0 otherwise. 
So we add - // - // x[i]*y[j] if either i or j is even - // 2*x[i]*y[j] if i and j are both odd - // - // by using precomputed multiples of x[i] for odd i: - - x1_2 := 2 * x[1] // This fits in a u32 iff 25 + b + 1 < 32 - x3_2 := 2 * x[3] // iff b < 6 - x5_2 := 2 * x[5] - x7_2 := 2 * x[7] - x9_2 := 2 * x[9] - - z0 := m(x[0], y[0]) + m(x1_2, y9_19) + m(x[2], y8_19) + m(x3_2, y7_19) + m(x[4], y6_19) + m(x5_2, y5_19) + m(x[6], y4_19) + m(x7_2, y3_19) + m(x[8], y2_19) + m(x9_2, y1_19) - z1 := m(x[0], y[1]) + m(x[1], y[0]) + m(x[2], y9_19) + m(x[3], y8_19) + m(x[4], y7_19) + m(x[5], y6_19) + m(x[6], y5_19) + m(x[7], y4_19) + m(x[8], y3_19) + m(x[9], y2_19) - z2 := m(x[0], y[2]) + m(x1_2, y[1]) + m(x[2], y[0]) + m(x3_2, y9_19) + m(x[4], y8_19) + m(x5_2, y7_19) + m(x[6], y6_19) + m(x7_2, y5_19) + m(x[8], y4_19) + m(x9_2, y3_19) - z3 := m(x[0], y[3]) + m(x[1], y[2]) + m(x[2], y[1]) + m(x[3], y[0]) + m(x[4], y9_19) + m(x[5], y8_19) + m(x[6], y7_19) + m(x[7], y6_19) + m(x[8], y5_19) + m(x[9], y4_19) - z4 := m(x[0], y[4]) + m(x1_2, y[3]) + m(x[2], y[2]) + m(x3_2, y[1]) + m(x[4], y[0]) + m(x5_2, y9_19) + m(x[6], y8_19) + m(x7_2, y7_19) + m(x[8], y6_19) + m(x9_2, y5_19) - z5 := m(x[0], y[5]) + m(x[1], y[4]) + m(x[2], y[3]) + m(x[3], y[2]) + m(x[4], y[1]) + m(x[5], y[0]) + m(x[6], y9_19) + m(x[7], y8_19) + m(x[8], y7_19) + m(x[9], y6_19) - z6 := m(x[0], y[6]) + m(x1_2, y[5]) + m(x[2], y[4]) + m(x3_2, y[3]) + m(x[4], y[2]) + m(x5_2, y[1]) + m(x[6], y[0]) + m(x7_2, y9_19) + m(x[8], y8_19) + m(x9_2, y7_19) - z7 := m(x[0], y[7]) + m(x[1], y[6]) + m(x[2], y[5]) + m(x[3], y[4]) + m(x[4], y[3]) + m(x[5], y[2]) + m(x[6], y[1]) + m(x[7], y[0]) + m(x[8], y9_19) + m(x[9], y8_19) - z8 := m(x[0], y[8]) + m(x1_2, y[7]) + m(x[2], y[6]) + m(x3_2, y[5]) + m(x[4], y[4]) + m(x5_2, y[3]) + m(x[6], y[2]) + m(x7_2, y[1]) + m(x[8], y[0]) + m(x9_2, y9_19) - z9 := m(x[0], y[9]) + m(x[1], y[8]) + m(x[2], y[7]) + m(x[3], y[6]) + m(x[4], y[5]) + m(x[5], y[4]) + m(x[6], y[3]) + m(x[7], y[2]) + m(x[8], y[1]) + m(x[9], y[0]) - - return fe.reduce(&[10]uint64{z0, z1, z2, z3, z4, z5, z6, z7, z8, z9}) -} - -// Neg sets `fe = -t`, and returns fe. -func (fe *Element) Neg(t *Element) *Element { - // Compute -b as ((2^4 * p) - b) to avoid underflow. - return fe.reduce(&[10]uint64{ - uint64((0x3ffffed << 4) - t.inner[0]), - uint64((0x1ffffff << 4) - t.inner[1]), - uint64((0x3ffffff << 4) - t.inner[2]), - uint64((0x1ffffff << 4) - t.inner[3]), - uint64((0x3ffffff << 4) - t.inner[4]), - uint64((0x1ffffff << 4) - t.inner[5]), - uint64((0x3ffffff << 4) - t.inner[6]), - uint64((0x1ffffff << 4) - t.inner[7]), - uint64((0x3ffffff << 4) - t.inner[8]), - uint64((0x1ffffff << 4) - t.inner[9]), - }) -} +import "github.com/oasisprotocol/curve25519-voi/internal/subtle" // ConditionalSelect sets the field element to a iff choice == 0 and // b iff choice == 1. @@ -228,312 +95,6 @@ func (fe *Element) MinusOne() *Element { return fe } -func (fe *Element) reduce(z *[10]uint64) *Element { - const ( - low_25_bit_mask uint64 = (1 << 25) - 1 - low_26_bit_mask uint64 = (1 << 26) - 1 - ) - - carry := func(z *[10]uint64, i uint) { - switch i & 1 { - case 0: - // Even limbs have 26 bits. - z[i+1] += z[i] >> 26 - z[i] &= low_26_bit_mask - case 1: - // Odd limbs have 25 bits. - z[i+1] += z[i] >> 25 - z[i] &= low_25_bit_mask - } - } - - // Perform two halves of the carry chain in parallel. 
- carry(z, 0) - carry(z, 4) - carry(z, 1) - carry(z, 5) - carry(z, 2) - carry(z, 6) - carry(z, 3) - carry(z, 7) - // Since z[3] < 2^64, c < 2^(64-25) = 2^39, - // so z[4] < 2^26 + 2^39 < 2^39.0002 - carry(z, 4) - carry(z, 8) - // Now z[4] < 2^26 - // and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough) - - // Last carry has a multiplication by 19: - z[0] += 19 * (z[9] >> 25) - z[9] &= low_25_bit_mask - - // Since z[9] < 2^64, c < 2^(64-25) = 2^39, - // so z[0] + 19*c < 2^26 + 2^43.248 < 2^43.249. - carry(z, 0) - // Now z[1] < 2^25 - 2^(43.249 - 26) - // < 2^25.007 (good enough) - // and we're done. - - fe.inner[0] = uint32(z[0]) - fe.inner[1] = uint32(z[1]) - fe.inner[2] = uint32(z[2]) - fe.inner[3] = uint32(z[3]) - fe.inner[4] = uint32(z[4]) - fe.inner[5] = uint32(z[5]) - fe.inner[6] = uint32(z[6]) - fe.inner[7] = uint32(z[7]) - fe.inner[8] = uint32(z[8]) - fe.inner[9] = uint32(z[9]) - - return fe -} - -// SetBytes loads a field element from the low 255-bits of a 256-bit input. -// -// WARNING: This function does not check that the input used the canonical -// representative. It masks the high bit, but it will happily decode -// 2^255 - 18 to 1. Applications that require a canonical encoding of -// every field element should decode, re-encode to the canonical encoding, -// and check that the input was canonical. -func (fe *Element) SetBytes(in []byte) (*Element, error) { - if len(in) != ElementSize { - return nil, fmt.Errorf("internal/field/u32: unexpected in size") - } - - load3 := func(b []byte) uint64 { - return uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) - } - load4 := func(b []byte) uint64 { - return uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3]) << 24) - } - - var h [10]uint64 - const low_23_bit_mask uint64 = (1 << 23) - 1 - h[0] = load4(in[0:4]) - h[1] = load3(in[4:7]) << 6 - h[2] = load3(in[7:10]) << 5 - h[3] = load3(in[10:13]) << 3 - h[4] = load3(in[13:16]) << 2 - h[5] = load4(in[16:20]) - h[6] = load3(in[20:23]) << 7 - h[7] = load3(in[23:26]) << 5 - h[8] = load3(in[26:29]) << 4 - h[9] = (load3(in[29:32]) & low_23_bit_mask) << 2 - - fe.reduce(&h) - - return fe, nil -} - -// SetBytesWide loads a field element from a 512-bit little-endian input. -func (fe *Element) SetBytesWide(in []byte) (*Element, error) { - if len(in) != ElementWideSize { - return nil, fmt.Errorf("internal/field/u32: unexpected input size") - } - - var lo, hi Element - if _, err := lo.SetBytes(in[:ElementSize]); err != nil { - return nil, fmt.Errorf("internal/field/u32: failed to deserialize lo: %w", err) - } - if _, err := hi.SetBytes(in[ElementSize:]); err != nil { - return nil, fmt.Errorf("internal/field/u32: failed to deserialize hi: %w", err) - } - - // Handle the 256th and 512th bits (MSB of lo and hi) explicitly - // as SetBytes ignores them. - lo.inner[0] += uint32(in[31]>>7)*19 + uint32(in[63]>>7)*2*19*19 - - // Back-of-the-envelope math says that this will fit without widening, - // but might as well do it this way. 
- return fe.reduce(&[10]uint64{ - uint64(lo.inner[0]) + 2*19*uint64(hi.inner[0]), - uint64(lo.inner[1]) + 2*19*uint64(hi.inner[1]), - uint64(lo.inner[2]) + 2*19*uint64(hi.inner[2]), - uint64(lo.inner[3]) + 2*19*uint64(hi.inner[3]), - uint64(lo.inner[4]) + 2*19*uint64(hi.inner[4]), - uint64(lo.inner[5]) + 2*19*uint64(hi.inner[5]), - uint64(lo.inner[6]) + 2*19*uint64(hi.inner[6]), - uint64(lo.inner[7]) + 2*19*uint64(hi.inner[7]), - uint64(lo.inner[8]) + 2*19*uint64(hi.inner[8]), - uint64(lo.inner[9]) + 2*19*uint64(hi.inner[9]), - }), nil -} - -// ToBytes packs the field element into 32 bytes. The encoding is canonical. -func (fe *Element) ToBytes(out []byte) error { - if len(out) != ElementSize { - return fmt.Errorf("internal/field/u32: unexpected output size") - } - - // Reduce the value represented by `fe` to the range [0,2*p) - var reduced Element - reduced.reduce(&[10]uint64{ - uint64(fe.inner[0]), uint64(fe.inner[1]), uint64(fe.inner[2]), uint64(fe.inner[3]), uint64(fe.inner[4]), - uint64(fe.inner[5]), uint64(fe.inner[6]), uint64(fe.inner[7]), uint64(fe.inner[8]), uint64(fe.inner[9]), - }) - - h0, h1, h2, h3, h4, h5, h6, h7, h8, h9 := reduced.inner[0], reduced.inner[1], reduced.inner[2], reduced.inner[3], reduced.inner[4], reduced.inner[5], reduced.inner[6], reduced.inner[7], reduced.inner[8], reduced.inner[9] - - // Let h be the value to encode. - // - // Write h = pq + r with 0 <= r < p. We want to compute r = h mod p. - // - // Since h < 2*p, q = 0 or 1, with q = 0 when h < p and q = 1 when h >= p. - // - // Notice that h >= p <==> h + 19 >= p + 19 <==> h + 19 >= 2^255. - // Therefore q can be computed as the carry bit of h + 19. - - q := (h0 + 19) >> 26 - q = (h1 + q) >> 25 - q = (h2 + q) >> 26 - q = (h3 + q) >> 25 - q = (h4 + q) >> 26 - q = (h5 + q) >> 25 - q = (h6 + q) >> 26 - q = (h7 + q) >> 25 - q = (h8 + q) >> 26 - q = (h9 + q) >> 25 - - // Now we can compute r as r = h - pq = r - (2^255-19)q = r + 19q - 2^255q - - const ( - low_25_bit_mask uint32 = (1 << 25) - 1 - low_26_bit_mask uint32 = (1 << 26) - 1 - ) - - h0 += 19 * q - - // Now carry the result to compute r + 19q... - h1 += h0 >> 26 - h0 = h0 & low_26_bit_mask - h2 += h1 >> 25 - h1 = h1 & low_25_bit_mask - h3 += h2 >> 26 - h2 = h2 & low_26_bit_mask - h4 += h3 >> 25 - h3 = h3 & low_25_bit_mask - h5 += h4 >> 26 - h4 = h4 & low_26_bit_mask - h6 += h5 >> 25 - h5 = h5 & low_25_bit_mask - h7 += h6 >> 26 - h6 = h6 & low_26_bit_mask - h8 += h7 >> 25 - h7 = h7 & low_25_bit_mask - h9 += h8 >> 26 - h8 = h8 & low_26_bit_mask - - // ... but instead of carrying the value - // (h9 >> 25) = q*2^255 into another limb, - // discard it, subtracting the value from h. 
- h9 = h9 & low_25_bit_mask - - out[0] = byte(h0 >> 0) - out[1] = byte(h0 >> 8) - out[2] = byte(h0 >> 16) - out[3] = byte((h0 >> 24) | (h1 << 2)) - out[4] = byte(h1 >> 6) - out[5] = byte(h1 >> 14) - out[6] = byte((h1 >> 22) | (h2 << 3)) - out[7] = byte(h2 >> 5) - out[8] = byte(h2 >> 13) - out[9] = byte((h2 >> 21) | (h3 << 5)) - out[10] = byte(h3 >> 3) - out[11] = byte(h3 >> 11) - out[12] = byte((h3 >> 19) | (h4 << 6)) - out[13] = byte(h4 >> 2) - out[14] = byte(h4 >> 10) - out[15] = byte(h4 >> 18) - out[16] = byte(h5 >> 0) - out[17] = byte(h5 >> 8) - out[18] = byte(h5 >> 16) - out[19] = byte((h5 >> 24) | (h6 << 1)) - out[20] = byte(h6 >> 7) - out[21] = byte(h6 >> 15) - out[22] = byte((h6 >> 23) | (h7 << 3)) - out[23] = byte(h7 >> 5) - out[24] = byte(h7 >> 13) - out[25] = byte((h7 >> 21) | (h8 << 4)) - out[26] = byte(h8 >> 4) - out[27] = byte(h8 >> 12) - out[28] = byte((h8 >> 20) | (h9 << 6)) - out[29] = byte(h9 >> 2) - out[30] = byte(h9 >> 10) - out[31] = byte(h9 >> 18) - - return nil -} - -// Pow2k sets `fe = t^(2^k)`, given `k > 0`, and returns fe. -func (fe *Element) Pow2k(t *Element, k uint) *Element { - if k == 0 { - panic("internal/field/u32: k out of bounds") - } - - var z [10]uint64 - - // Handle the first squaring separately to save a copy. - squareInner(&t.inner, &z) - fe.reduce(&z) - - // And do the rest. - for ; k > 1; k-- { - squareInner(&fe.inner, &z) - fe.reduce(&z) - } - - return fe -} - -// Square sets `fe = t^2`, and returns fe. -func (fe *Element) Square(t *Element) *Element { - var z [10]uint64 - squareInner(&t.inner, &z) - return fe.reduce(&z) -} - -// Square2 sets `fe = 2*t^2`, and returns fe. -func (fe *Element) Square2(t *Element) *Element { - var z [10]uint64 - squareInner(&t.inner, &z) - for i := 0; i < 10; i++ { - z[i] *= 2 - } - return fe.reduce(&z) -} - -func squareInner(x *[10]uint32, z *[10]uint64) { - // Optimized version of multiplication for the case of squaring. - // Pre- and post- conditions identical to multiplication function. - x0_2 := 2 * x[0] - x1_2 := 2 * x[1] - x2_2 := 2 * x[2] - x3_2 := 2 * x[3] - x4_2 := 2 * x[4] - x5_2 := 2 * x[5] - x6_2 := 2 * x[6] - x7_2 := 2 * x[7] - x5_19 := 19 * x[5] - x6_19 := 19 * x[6] - x7_19 := 19 * x[7] - x8_19 := 19 * x[8] - x9_19 := 19 * x[9] - - // This block is rearranged so that instead of doing a 32-bit multiplication by 38, we do a - // 64-bit multiplication by 2 on the results. This is because lg(38) is too big: we would - // have less than 1 bit of headroom left, which is too little. 
- z[0] = m(x[0], x[0]) + m(x2_2, x8_19) + m(x4_2, x6_19) + (m(x1_2, x9_19)+m(x3_2, x7_19)+m(x[5], x5_19))*2 - z[1] = m(x0_2, x[1]) + m(x3_2, x8_19) + m(x5_2, x6_19) + (m(x[2], x9_19)+m(x[4], x7_19))*2 - z[2] = m(x0_2, x[2]) + m(x1_2, x[1]) + m(x4_2, x8_19) + m(x[6], x6_19) + (m(x3_2, x9_19)+m(x5_2, x7_19))*2 - z[3] = m(x0_2, x[3]) + m(x1_2, x[2]) + m(x5_2, x8_19) + (m(x[4], x9_19)+m(x[6], x7_19))*2 - z[4] = m(x0_2, x[4]) + m(x1_2, x3_2) + m(x[2], x[2]) + m(x6_2, x8_19) + (m(x5_2, x9_19)+m(x[7], x7_19))*2 - z[5] = m(x0_2, x[5]) + m(x1_2, x[4]) + m(x2_2, x[3]) + m(x7_2, x8_19) + m(x[6], x9_19)*2 - z[6] = m(x0_2, x[6]) + m(x1_2, x5_2) + m(x2_2, x[4]) + m(x3_2, x[3]) + m(x[8], x8_19) + m(x7_2, x9_19)*2 - z[7] = m(x0_2, x[7]) + m(x1_2, x[6]) + m(x2_2, x[5]) + m(x3_2, x[4]) + m(x[8], x9_19)*2 - z[8] = m(x0_2, x[8]) + m(x1_2, x7_2) + m(x2_2, x[6]) + m(x3_2, x5_2) + m(x[4], x[4]) + m(x[9], x9_19)*2 - z[9] = m(x0_2, x[9]) + m(x1_2, x[8]) + m(x2_2, x[7]) + m(x3_2, x[6]) + m(x4_2, x[5]) -} - // NewElement2625 constructs a field element from its raw component limbs. func NewElement2625(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 uint32) Element { return Element{ diff --git a/internal/field/field_u64.go b/internal/field/field_u64.go index bca6078..319cb72 100644 --- a/internal/field/field_u64.go +++ b/internal/field/field_u64.go @@ -35,233 +35,13 @@ package field -import ( - "encoding/binary" - "fmt" - "math/bits" - - "github.com/oasisprotocol/curve25519-voi/internal/disalloweq" - "github.com/oasisprotocol/curve25519-voi/internal/subtle" -) - -const ( - low_51_bit_mask uint64 = (1 << 51) - 1 - - // 16 * p - p_times_sixteen_0 = 36028797018963664 - p_times_sixteen_1234 = 36028797018963952 -) - -// Element represents an element of the field Z/(2^255 - 19). -type Element struct { - disalloweq.DisallowEqual //nolint:unused - inner [5]uint64 -} - -// Add sets `fe = a + b`, and returns fe. -func (fe *Element) Add(a, b *Element) *Element { - fe.inner[0] = a.inner[0] + b.inner[0] - fe.inner[1] = a.inner[1] + b.inner[1] - fe.inner[2] = a.inner[2] + b.inner[2] - fe.inner[3] = a.inner[3] + b.inner[3] - fe.inner[4] = a.inner[4] + b.inner[4] - return fe -} - -// Sub sets `fe = a - b`, and returns fe. -func (fe *Element) Sub(a, b *Element) *Element { - // To avoid underflow, first add a multiple of p. - // Choose 16*p = p << 4 to be larger than 54-bit b. - // - // If we could statically track the bitlengths of the limbs - // of every Element, we could choose a multiple of p - // just bigger than b and avoid having to do a reduction. - - return fe.reduce(&[5]uint64{ - (a.inner[0] + p_times_sixteen_0) - b.inner[0], - (a.inner[1] + p_times_sixteen_1234) - b.inner[1], - (a.inner[2] + p_times_sixteen_1234) - b.inner[2], - (a.inner[3] + p_times_sixteen_1234) - b.inner[3], - (a.inner[4] + p_times_sixteen_1234) - b.inner[4], - }) -} - -// Mul sets `fe =a * b`, and returns fe. -func (fe *Element) Mul(a, b *Element) *Element { - feMul(fe, a, b) - return fe -} - -func feMulGeneric(fe, a, b *Element) { //nolint:unused,deadcode - a0, a1, a2, a3, a4 := a.inner[0], a.inner[1], a.inner[2], a.inner[3], a.inner[4] - b0, b1, b2, b3, b4 := b.inner[0], b.inner[1], b.inner[2], b.inner[3], b.inner[4] - - // Precondition: assume input limbs a[i], b[i] are bounded as - // - // a[i], b[i] < 2^(51 + b) - // - // where b is a real parameter measuring the "bit excess" of the limbs. - - // 64-bit precomputations to avoid 128-bit multiplications. - // - // This fits into a u64 whenever 51 + b + lg(19) < 64. 
- // - // Since 51 + b + lg(19) < 51 + 4.25 + b - // = 55.25 + b, - // this fits if b < 8.75. - b1_19 := b1 * 19 - b2_19 := b2 * 19 - b3_19 := b3 * 19 - b4_19 := b4 * 19 - - // Multiply to get 128-bit coefficients of output - var carry uint64 - - c0_hi, c0_lo := bits.Mul64(a0, b0) - t0_hi, t0_lo := bits.Mul64(a4, b1_19) - c0_lo, carry = bits.Add64(c0_lo, t0_lo, 0) - c0_hi, _ = bits.Add64(c0_hi, t0_hi, carry) - t0_hi, t0_lo = bits.Mul64(a3, b2_19) - c0_lo, carry = bits.Add64(c0_lo, t0_lo, 0) - c0_hi, _ = bits.Add64(c0_hi, t0_hi, carry) - t0_hi, t0_lo = bits.Mul64(a2, b3_19) - c0_lo, carry = bits.Add64(c0_lo, t0_lo, 0) - c0_hi, _ = bits.Add64(c0_hi, t0_hi, carry) - t0_hi, t0_lo = bits.Mul64(a1, b4_19) - c0_lo, carry = bits.Add64(c0_lo, t0_lo, 0) - c0_hi, _ = bits.Add64(c0_hi, t0_hi, carry) - - c1_hi, c1_lo := bits.Mul64(a1, b0) - t1_hi, t1_lo := bits.Mul64(a0, b1) - c1_lo, carry = bits.Add64(c1_lo, t1_lo, 0) - c1_hi, _ = bits.Add64(c1_hi, t1_hi, carry) - t1_hi, t1_lo = bits.Mul64(a4, b2_19) - c1_lo, carry = bits.Add64(c1_lo, t1_lo, 0) - c1_hi, _ = bits.Add64(c1_hi, t1_hi, carry) - t1_hi, t1_lo = bits.Mul64(a3, b3_19) - c1_lo, carry = bits.Add64(c1_lo, t1_lo, 0) - c1_hi, _ = bits.Add64(c1_hi, t1_hi, carry) - t1_hi, t1_lo = bits.Mul64(a2, b4_19) - c1_lo, carry = bits.Add64(c1_lo, t1_lo, 0) - c1_hi, _ = bits.Add64(c1_hi, t1_hi, carry) - - c2_hi, c2_lo := bits.Mul64(a2, b0) - t2_hi, t2_lo := bits.Mul64(a1, b1) - c2_lo, carry = bits.Add64(c2_lo, t2_lo, 0) - c2_hi, _ = bits.Add64(c2_hi, t2_hi, carry) - t2_hi, t2_lo = bits.Mul64(a0, b2) - c2_lo, carry = bits.Add64(c2_lo, t2_lo, 0) - c2_hi, _ = bits.Add64(c2_hi, t2_hi, carry) - t2_hi, t2_lo = bits.Mul64(a4, b3_19) - c2_lo, carry = bits.Add64(c2_lo, t2_lo, 0) - c2_hi, _ = bits.Add64(c2_hi, t2_hi, carry) - t2_hi, t2_lo = bits.Mul64(a3, b4_19) - c2_lo, carry = bits.Add64(c2_lo, t2_lo, 0) - c2_hi, _ = bits.Add64(c2_hi, t2_hi, carry) - - c3_hi, c3_lo := bits.Mul64(a3, b0) - t3_hi, t3_lo := bits.Mul64(a2, b1) - c3_lo, carry = bits.Add64(c3_lo, t3_lo, 0) - c3_hi, _ = bits.Add64(c3_hi, t3_hi, carry) - t3_hi, t3_lo = bits.Mul64(a1, b2) - c3_lo, carry = bits.Add64(c3_lo, t3_lo, 0) - c3_hi, _ = bits.Add64(c3_hi, t3_hi, carry) - t3_hi, t3_lo = bits.Mul64(a0, b3) - c3_lo, carry = bits.Add64(c3_lo, t3_lo, 0) - c3_hi, _ = bits.Add64(c3_hi, t3_hi, carry) - t3_hi, t3_lo = bits.Mul64(a4, b4_19) - c3_lo, carry = bits.Add64(c3_lo, t3_lo, 0) - c3_hi, _ = bits.Add64(c3_hi, t3_hi, carry) - - c4_hi, c4_lo := bits.Mul64(a4, b0) - t4_hi, t4_lo := bits.Mul64(a3, b1) - c4_lo, carry = bits.Add64(c4_lo, t4_lo, 0) - c4_hi, _ = bits.Add64(c4_hi, t4_hi, carry) - t4_hi, t4_lo = bits.Mul64(a2, b2) - c4_lo, carry = bits.Add64(c4_lo, t4_lo, 0) - c4_hi, _ = bits.Add64(c4_hi, t4_hi, carry) - t4_hi, t4_lo = bits.Mul64(a1, b3) - c4_lo, carry = bits.Add64(c4_lo, t4_lo, 0) - c4_hi, _ = bits.Add64(c4_hi, t4_hi, carry) - t4_hi, t4_lo = bits.Mul64(a0, b4) - c4_lo, carry = bits.Add64(c4_lo, t4_lo, 0) - c4_hi, _ = bits.Add64(c4_hi, t4_hi, carry) - - // How big are the c[i]? We have - // - // c[i] < 2^(102 + 2*b) * (1+i + (4-i)*19) - // < 2^(102 + lg(1 + 4*19) + 2*b) - // < 2^(108.27 + 2*b) - // - // The carry (c[i] >> 51) fits into a u64 when - // 108.27 + 2*b - 51 < 64 - // 2*b < 6.73 - // b < 3.365. - // - // So we require b < 3 to ensure this fits. 
- - tmp := (c0_hi << (64 - 51)) | (c0_lo >> 51) - c1_lo, carry = bits.Add64(c1_lo, tmp, 0) - c1_hi, _ = bits.Add64(c1_hi, 0, carry) - fe0 := c0_lo & low_51_bit_mask - - tmp = (c1_hi << (64 - 51)) | (c1_lo >> 51) - c2_lo, carry = bits.Add64(c2_lo, tmp, 0) - c2_hi, _ = bits.Add64(c2_hi, 0, carry) - fe1 := c1_lo & low_51_bit_mask - - tmp = (c2_hi << (64 - 51)) | (c2_lo >> 51) - c3_lo, carry = bits.Add64(c3_lo, tmp, 0) - c3_hi, _ = bits.Add64(c3_hi, 0, carry) - fe.inner[2] = c2_lo & low_51_bit_mask - - tmp = (c3_hi << (64 - 51)) | (c3_lo >> 51) - c4_lo, carry = bits.Add64(c4_lo, tmp, 0) - c4_hi, _ = bits.Add64(c4_hi, 0, carry) - fe.inner[3] = c3_lo & low_51_bit_mask - - carry = (c4_hi << (64 - 51)) | (c4_lo >> 51) - fe.inner[4] = c4_lo & low_51_bit_mask - - // To see that this does not overflow, we need fe[0] + carry * 19 < 2^64. - // - // c4 < a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 + (carry from c3) - // < 5*(2^(51 + b) * 2^(51 + b)) + (carry from c3) - // < 2^(102 + 2*b + lg(5)) + 2^64. - // - // When b < 3 we get - // - // c4 < 2^110.33 so that carry < 2^59.33 - // - // so that - // - // fe[0] + carry * 19 < 2^51 + 19 * 2^59.33 < 2^63.58 - // - // and there is no overflow. - fe0 = fe0 + carry*19 - - // Now fe[1] < 2^51 + 2^(64 -51) = 2^51 + 2^13 < 2^(51 + epsilon). - fe.inner[1] = fe1 + (fe0 >> 51) - fe.inner[0] = fe0 & low_51_bit_mask - - // Now fe[i] < 2^(51 + epsilon) for all i. -} - -// Neg sets `fe = -t`, and returns fe. -func (fe *Element) Neg(t *Element) *Element { - // See commentary in the Sub impl. - return fe.reduce(&[5]uint64{ - p_times_sixteen_0 - t.inner[0], - p_times_sixteen_1234 - t.inner[1], - p_times_sixteen_1234 - t.inner[2], - p_times_sixteen_1234 - t.inner[3], - p_times_sixteen_1234 - t.inner[4], - }) -} +import "github.com/oasisprotocol/curve25519-voi/internal/subtle" // ConditionalSelect sets the field element to a iff choice == 0 and // b iff choice == 1. func (fe *Element) ConditionalSelect(a, b *Element, choice int) { + // This would use fiat.Selectznz, but arg1 takes a fiat.uint1, which + // is unexported, so the routine is useless. fe.inner[0] = subtle.ConstantTimeSelectUint64(choice, b.inner[0], a.inner[0]) fe.inner[1] = subtle.ConstantTimeSelectUint64(choice, b.inner[1], a.inner[1]) fe.inner[2] = subtle.ConstantTimeSelectUint64(choice, b.inner[2], a.inner[2]) @@ -301,358 +81,6 @@ func (fe *Element) MinusOne() *Element { return fe } -func (fe *Element) reduce(limbs *[5]uint64) *Element { - // Since the input limbs are bounded by 2^64, the biggest - // carry-out is bounded by 2^13. - // - // The biggest carry-in is c4 * 19, resulting in - // - // 2^51 + 19*2^13 < 2^51.0000000001 - // - // Because we don't need to canonicalize, only to reduce the - // limb sizes, it's OK to do a "weak reduction", where we - // compute the carry-outs in parallel. - - l0, l1, l2, l3, l4 := limbs[0], limbs[1], limbs[2], limbs[3], limbs[4] - - c0 := l0 >> 51 - c1 := l1 >> 51 - c2 := l2 >> 51 - c3 := l3 >> 51 - c4 := l4 >> 51 - - l0 &= low_51_bit_mask - l1 &= low_51_bit_mask - l2 &= low_51_bit_mask - l3 &= low_51_bit_mask - l4 &= low_51_bit_mask - - fe.inner[0] = l0 + c4*19 - fe.inner[1] = l1 + c0 - fe.inner[2] = l2 + c1 - fe.inner[3] = l3 + c2 - fe.inner[4] = l4 + c3 - - return fe -} - -// SetBytes loads a field element from the low 255 bits of a 256 bit input. -// -// WARNING: This function does not check that the input used the canonical -// representative. It masks the high bit, but it will happily decode -// 2^255 - 18 to 1. 
Applications that require a canonical encoding of -// every field element should decode, re-encode to the canonical encoding, -// and check that the input was canonical. -func (fe *Element) SetBytes(in []byte) (*Element, error) { - if len(in) != ElementSize { - return nil, fmt.Errorf("internal/field/u64: unexpected input size") - } - - _ = in[31] - *fe = Element{ - inner: [5]uint64{ - // load bits [ 0, 64), no shift - binary.LittleEndian.Uint64(in[0:8]) & low_51_bit_mask, - // load bits [ 48,112), shift to [ 51,112) - (binary.LittleEndian.Uint64(in[6:14]) >> 3) & low_51_bit_mask, - // load bits [ 96,160), shift to [102,160) - (binary.LittleEndian.Uint64(in[12:20]) >> 6) & low_51_bit_mask, - // load bits [152,216), shift to [153,216) - (binary.LittleEndian.Uint64(in[19:27]) >> 1) & low_51_bit_mask, - // load bits [192,256), shift to [204,112) - (binary.LittleEndian.Uint64(in[24:32]) >> 12) & low_51_bit_mask, - }, - } - - return fe, nil -} - -// SetBytesWide loads a field element from a 512-bit little-endian input. -func (fe *Element) SetBytesWide(in []byte) (*Element, error) { - if len(in) != ElementWideSize { - return nil, fmt.Errorf("internal/field/u64: unexpected input size") - } - - var lo, hi Element - if _, err := lo.SetBytes(in[:ElementSize]); err != nil { - return nil, fmt.Errorf("internal/field/u64: failed to deserialize lo: %w", err) - } - if _, err := hi.SetBytes(in[ElementSize:]); err != nil { - return nil, fmt.Errorf("internal/field/u64: failed to deserialize hi: %w", err) - } - - // Handle the 256th and 512th bits (MSB of lo and hi) explicitly - // as SetBytes ignores them. - lo.inner[0] += uint64(in[31]>>7)*19 + uint64(in[63]>>7)*2*19*19 - - lo.inner[0] += 2 * 19 * hi.inner[0] - lo.inner[1] += 2 * 19 * hi.inner[1] - lo.inner[2] += 2 * 19 * hi.inner[2] - lo.inner[3] += 2 * 19 * hi.inner[3] - lo.inner[4] += 2 * 19 * hi.inner[4] - - fe.reduce(&lo.inner) - - return fe, nil -} - -// ToBytes packs the field element into 32 bytes. The encoding is canonical. -func (fe *Element) ToBytes(out []byte) error { - if len(out) != ElementSize { - return fmt.Errorf("internal/field/u64: unexpected output size") - } - - // Let h = limbs[0] + limbs[1]*2^51 + ... + limbs[4]*2^204. - // - // Write h = pq + r with 0 <= r < p. - // - // We want to compute r = h mod p. - // - // If h < 2*p = 2^256 - 38, - // then q = 0 or 1, - // - // with q = 0 when h < p - // and q = 1 when h >= p. - // - // Notice that h >= p <==> h + 19 >= p + 19 <==> h + 19 >= 2^255. - // Therefore q can be computed as the carry bit of h + 19. - - // First, reduce the limbs to ensure h < 2*p. - var reduced Element - reduced.reduce(&fe.inner) - l0, l1, l2, l3, l4 := reduced.inner[0], reduced.inner[1], reduced.inner[2], reduced.inner[3], reduced.inner[4] - - q := (l0 + 19) >> 51 - q = (l1 + q) >> 51 - q = (l2 + q) >> 51 - q = (l3 + q) >> 51 - q = (l4 + q) >> 51 - - // Now we can compute r as r = h - pq = r - (2^255-19)q = r + 19q - 2^255q - - l0 += 19 * q - - // Now carry the result to compute r + 19q ... - l1 += l0 >> 51 - l0 = l0 & low_51_bit_mask - l2 += l1 >> 51 - l1 = l1 & low_51_bit_mask - l3 += l2 >> 51 - l2 = l2 & low_51_bit_mask - l4 += l3 >> 51 - l3 = l3 & low_51_bit_mask - // ... 
but instead of carrying (l4 >> 51) = 2^255q - // into another limb, discard it, subtracting the value - l4 = l4 & low_51_bit_mask - - out[0] = byte(l0) - out[1] = byte(l0 >> 8) - out[2] = byte(l0 >> 16) - out[3] = byte(l0 >> 24) - out[4] = byte(l0 >> 32) - out[5] = byte(l0 >> 40) - out[6] = byte((l0 >> 48) | (l1 << 3)) - out[7] = byte(l1 >> 5) - out[8] = byte(l1 >> 13) - out[9] = byte(l1 >> 21) - out[10] = byte(l1 >> 29) - out[11] = byte(l1 >> 37) - out[12] = byte((l1 >> 45) | (l2 << 6)) - out[13] = byte(l2 >> 2) - out[14] = byte(l2 >> 10) - out[15] = byte(l2 >> 18) - out[16] = byte(l2 >> 26) - out[17] = byte(l2 >> 34) - out[18] = byte(l2 >> 42) - out[19] = byte((l2 >> 50) | (l3 << 1)) - out[20] = byte(l3 >> 7) - out[21] = byte(l3 >> 15) - out[22] = byte(l3 >> 23) - out[23] = byte(l3 >> 31) - out[24] = byte(l3 >> 39) - out[25] = byte((l3 >> 47) | (l4 << 4)) - out[26] = byte(l4 >> 4) - out[27] = byte(l4 >> 12) - out[28] = byte(l4 >> 20) - out[29] = byte(l4 >> 28) - out[30] = byte(l4 >> 36) - out[31] = byte(l4 >> 44) - - return nil -} - -// Pow2k sets `fe = t^(2^k)`, given `k > 0`, and returns fe -func (fe *Element) Pow2k(t *Element, k uint) *Element { - if k == 0 { - panic("internal/field/u64: k out of bounds") - } - - fePow2k(fe, t, k) - return fe -} - -func fePow2kGeneric(fe, t *Element, k uint) { //nolint:unused,deadcode - a0, a1, a2, a3, a4 := t.inner[0], t.inner[1], t.inner[2], t.inner[3], t.inner[4] - - for { - // Precondition: assume input limbs a[i] are bounded as - // - // a[i] < 2^(51 + b) - // - // where b is a real parameter measuring the "bit excess" of the limbs. - - // Precomputation: 64-bit multiply by 19. - // - // This fits into a u64 whenever 51 + b + lg(19) < 64. - // - // Since 51 + b + lg(19) < 51 + 4.25 + b - // = 55.25 + b, - // this fits if b < 8.75. - a3_19 := 19 * a3 - a4_19 := 19 * a4 - - // Multiply to get 128-bit coefficients of output. - // - // Note: dalek just uses 128-bit multiplication here instead of - // doing some precomputation. Since Go does not have an actual - // 128-bit integer type, this will opt for precomputing, primarily - // for the sake of readability. - // - // This fits into a u64 whenever 51 + b + lg(1) < 64. 
- - d0 := 2 * a0 - d1 := 2 * a1 - d2 := 2 * a2 - d4 := 2 * a4 - - var carry uint64 - - c0_hi, c0_lo := bits.Mul64(a0, a0) - t0_hi, t0_lo := bits.Mul64(d1, a4_19) - c0_lo, carry = bits.Add64(c0_lo, t0_lo, 0) - c0_hi, _ = bits.Add64(c0_hi, t0_hi, carry) - t0_hi, t0_lo = bits.Mul64(d2, a3_19) - c0_lo, carry = bits.Add64(c0_lo, t0_lo, 0) - c0_hi, _ = bits.Add64(c0_hi, t0_hi, carry) - - c1_hi, c1_lo := bits.Mul64(a3, a3_19) - t1_hi, t1_lo := bits.Mul64(d0, a1) - c1_lo, carry = bits.Add64(c1_lo, t1_lo, 0) - c1_hi, _ = bits.Add64(c1_hi, t1_hi, carry) - t1_hi, t1_lo = bits.Mul64(d2, a4_19) - c1_lo, carry = bits.Add64(c1_lo, t1_lo, 0) - c1_hi, _ = bits.Add64(c1_hi, t1_hi, carry) - - c2_hi, c2_lo := bits.Mul64(a1, a1) - t2_hi, t2_lo := bits.Mul64(d0, a2) - c2_lo, carry = bits.Add64(c2_lo, t2_lo, 0) - c2_hi, _ = bits.Add64(c2_hi, t2_hi, carry) - t2_hi, t2_lo = bits.Mul64(d4, a3_19) - c2_lo, carry = bits.Add64(c2_lo, t2_lo, 0) - c2_hi, _ = bits.Add64(c2_hi, t2_hi, carry) - - c3_hi, c3_lo := bits.Mul64(a4, a4_19) - t3_hi, t3_lo := bits.Mul64(d0, a3) - c3_lo, carry = bits.Add64(c3_lo, t3_lo, 0) - c3_hi, _ = bits.Add64(c3_hi, t3_hi, carry) - t3_hi, t3_lo = bits.Mul64(d1, a2) - c3_lo, carry = bits.Add64(c3_lo, t3_lo, 0) - c3_hi, _ = bits.Add64(c3_hi, t3_hi, carry) - - c4_hi, c4_lo := bits.Mul64(a2, a2) - t4_hi, t4_lo := bits.Mul64(d0, a4) - c4_lo, carry = bits.Add64(c4_lo, t4_lo, 0) - c4_hi, _ = bits.Add64(c4_hi, t4_hi, carry) - t4_hi, t4_lo = bits.Mul64(d1, a3) - c4_lo, carry = bits.Add64(c4_lo, t4_lo, 0) - c4_hi, _ = bits.Add64(c4_hi, t4_hi, carry) - - // Same bound as in multiply: - // c[i] < 2^(102 + 2*b) * (1+i + (4-i)*19) - // < 2^(102 + lg(1 + 4*19) + 2*b) - // < 2^(108.27 + 2*b) - // - // The carry (c[i] >> 51) fits into a u64 when - // 108.27 + 2*b - 51 < 64 - // 2*b < 6.73 - // b < 3.365. - // - // So we require b < 3 to ensure this fits. - - tmp := (c0_hi << (64 - 51)) | (c0_lo >> 51) - c1_lo, carry = bits.Add64(c1_lo, tmp, 0) - c1_hi, _ = bits.Add64(c1_hi, 0, carry) - a0 = c0_lo & low_51_bit_mask - - tmp = (c1_hi << (64 - 51)) | (c1_lo >> 51) - c2_lo, carry = bits.Add64(c2_lo, tmp, 0) - c2_hi, _ = bits.Add64(c2_hi, 0, carry) - a1 = c1_lo & low_51_bit_mask - - tmp = (c2_hi << (64 - 51)) | (c2_lo >> 51) - c3_lo, carry = bits.Add64(c3_lo, tmp, 0) - c3_hi, _ = bits.Add64(c3_hi, 0, carry) - a2 = c2_lo & low_51_bit_mask - - tmp = (c3_hi << (64 - 51)) | (c3_lo >> 51) - c4_lo, carry = bits.Add64(c4_lo, tmp, 0) - c4_hi, _ = bits.Add64(c4_hi, 0, carry) - a3 = c3_lo & low_51_bit_mask - - carry = (c4_hi << (64 - 51)) | (c4_lo >> 51) - a4 = c4_lo & low_51_bit_mask - - // To see that this does not overflow, we need a[0] + carry * 19 < 2^64. - // - // c4 < a2^2 + 2*a0*a4 + 2*a1*a3 + (carry from c3) - // < 2^(102 + 2*b + lg(5)) + 2^64. - // - // When b < 3 we get - // - // c4 < 2^110.33 so that carry < 2^59.33 - // - // so that - // - // a[0] + carry * 19 < 2^51 + 19 * 2^59.33 < 2^63.58 - // - // and there is no overflow. - a0 = a0 + carry*19 - - // Now a[1] < 2^51 + 2^(64 -51) = 2^51 + 2^13 < 2^(51 + epsilon). - a1 += a0 >> 51 - a0 &= low_51_bit_mask - - // Now all a[i] < 2^(51 + epsilon) and a = self^(2^k). - - k-- - if k == 0 { - break - } - } - - fe.inner[0], fe.inner[1], fe.inner[2], fe.inner[3], fe.inner[4] = a0, a1, a2, a3, a4 -} - -// Square sets `fe = t^2`, and returns fe. -func (fe *Element) Square(t *Element) *Element { - fePow2k(fe, t, 1) - return fe -} - -// Square2 sets `fe = 2*t^2`, and returns fe. 
-func (fe *Element) Square2(t *Element) *Element { - fePow2k(fe, t, 1) - for i := 0; i < 5; i++ { - fe.inner[i] *= 2 - } - return fe -} - -// UnsafeInner exposes the inner limbs to allow for the vector implementation. -func (fe *Element) UnsafeInner() *[5]uint64 { - return &fe.inner -} - // NewElement51 constructs a field element from its raw component limbs. func NewElement51(l0, l1, l2, l3, l4 uint64) Element { return Element{ diff --git a/internal/field/field_u64_amd64.go b/internal/field/field_u64_amd64.go deleted file mode 100644 index ad7db14..0000000 --- a/internal/field/field_u64_amd64.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2021 Oasis Labs Inc. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -//go:build amd64 && !purego && !force32bit -// +build amd64,!purego,!force32bit - -package field - -//go:noescape -func feMul(out, a, b *Element) - -//go:noescape -func fePow2k(out, a *Element, k uint) diff --git a/internal/field/field_u64_amd64.s b/internal/field/field_u64_amd64.s deleted file mode 100644 index d6d4541..0000000 --- a/internal/field/field_u64_amd64.s +++ /dev/null @@ -1,401 +0,0 @@ -// Code generated by command: go run field_u64.go. DO NOT EDIT. - -// +build amd64,!purego,!force32bit - -#include "textflag.h" - -// func feMul(out *Element, a *Element, b *Element) -TEXT ·feMul(SB), NOSPLIT|NOFRAME, $0-24 - // Based on assembly generated by PeachPy. Equivalent to the Go in - // feMulGeneric, which was originally based on the amd64-51-30k - // assembly in SUPERCOP. 
- MOVQ a+8(FP), CX - MOVQ b+16(FP), BX - - // r0 = x0*y0 + x1_19*y4 + x2_19*y3 + x3_19*y2 + x4_19*y1 - - // r00, r01 = x0*y0 - MOVQ (CX), AX - MULQ (BX) - MOVQ AX, SI - MOVQ DX, DI - - // r00, r01 += x1_19*y4 - MOVQ 8(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 32(BX) - ADDQ AX, SI - ADCQ DX, DI - - // r00, r01 += x2_19*y3 - MOVQ 16(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 24(BX) - ADDQ AX, SI - ADCQ DX, DI - - // r00, r01 += x3_19*y2 - MOVQ 24(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 16(BX) - ADDQ AX, SI - ADCQ DX, DI - - // r00, r01 += x4_19*y1 - MOVQ 32(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 8(BX) - ADDQ AX, SI - ADCQ DX, DI - - // r1 = x0*y1 + x1*y0 + x2_19*y4 + x3_19*y3 + x4_19*y2 - - // r10, r11 = x0*y1 - MOVQ (CX), AX - MULQ 8(BX) - MOVQ AX, R8 - MOVQ DX, R9 - - // r10, r11 += x1*y0 - MOVQ 8(CX), AX - MULQ (BX) - ADDQ AX, R8 - ADCQ DX, R9 - - // r10, r11 += x2_19*y4 - MOVQ 16(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 32(BX) - ADDQ AX, R8 - ADCQ DX, R9 - - // r10, r11 += x3_19*y3 - MOVQ 24(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 24(BX) - ADDQ AX, R8 - ADCQ DX, R9 - - // r10, r11 += x4_19*y2 - MOVQ 32(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 16(BX) - ADDQ AX, R8 - ADCQ DX, R9 - - // r2 = x0*y2 + x1*y1 + x2*y0 + x3_19*y4 + x4_19*y3 - - // r20, r11 = x0*y2 - MOVQ (CX), AX - MULQ 16(BX) - MOVQ AX, R10 - MOVQ DX, R11 - - // r20, r21 += x1*y1 - MOVQ 8(CX), AX - MULQ 8(BX) - ADDQ AX, R10 - ADCQ DX, R11 - - // r20, r21 += x2*y0 - MOVQ 16(CX), AX - MULQ (BX) - ADDQ AX, R10 - ADCQ DX, R11 - - // r20, r21 += x3_19*y4 - MOVQ 24(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 32(BX) - ADDQ AX, R10 - ADCQ DX, R11 - - // r20, r21 += x4_19*y3 - MOVQ 32(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 24(BX) - ADDQ AX, R10 - ADCQ DX, R11 - - // r3 = x0*y3 + x1*y2 + x2*y1 + x3*y0 + x4_19*y4 - - // r30, r31 = x0*y3 - MOVQ (CX), AX - MULQ 24(BX) - MOVQ AX, R12 - MOVQ DX, R13 - - // r30, r31 += x1*y2 - MOVQ 8(CX), AX - MULQ 16(BX) - ADDQ AX, R12 - ADCQ DX, R13 - - // r30, r31 += x2*y1 - MOVQ 16(CX), AX - MULQ 8(BX) - ADDQ AX, R12 - ADCQ DX, R13 - - // r30, r31 += x3*y0 - MOVQ 24(CX), AX - MULQ (BX) - ADDQ AX, R12 - ADCQ DX, R13 - - // r30, r31 += x4_19*y4 - MOVQ 32(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 32(BX) - ADDQ AX, R12 - ADCQ DX, R13 - - // r3 = x0*y4 + x1*y3 + x2*y2 + x3*y1 + x4*y0 - - // r40, r31 = x0*y4 - MOVQ (CX), AX - MULQ 32(BX) - MOVQ AX, R14 - MOVQ DX, R15 - - // r40, r41 += x1*y3 - MOVQ 8(CX), AX - MULQ 24(BX) - ADDQ AX, R14 - ADCQ DX, R15 - - // r40, r41 += x2*y2 - MOVQ 16(CX), AX - MULQ 16(BX) - ADDQ AX, R14 - ADCQ DX, R15 - - // r40, r41 += x3*y1 - MOVQ 24(CX), AX - MULQ 8(BX) - ADDQ AX, R14 - ADCQ DX, R15 - - // r40, r41 += x4*y0 - MOVQ 32(CX), AX - MULQ (BX) - ADDQ AX, R14 - ADCQ DX, R15 - - // Reduce - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, SI, DI - ANDQ AX, SI - SHLQ $0x0d, R8, R9 - ANDQ AX, R8 - ADDQ DI, R8 - SHLQ $0x0d, R10, R11 - ANDQ AX, R10 - ADDQ R9, R10 - SHLQ $0x0d, R12, R13 - ANDQ AX, R12 - ADDQ R11, R12 - SHLQ $0x0d, R14, R15 - ANDQ AX, R14 - ADDQ R13, R14 - IMUL3Q $0x13, R15, R15 - ADDQ R15, SI - MOVQ SI, DI - MOVQ R8, R9 - MOVQ R10, R11 - MOVQ R12, R13 - MOVQ R14, R15 - ANDQ AX, SI - ANDQ AX, R8 - ANDQ AX, R10 - ANDQ AX, R12 - ANDQ AX, R14 - SHRQ $0x33, DI - SHRQ $0x33, R9 - SHRQ $0x33, R11 - SHRQ $0x33, R13 - SHRQ $0x33, R15 - IMUL3Q $0x13, R15, R15 - ADDQ DI, R8 - ADDQ R9, R10 - ADDQ R11, R12 - ADDQ R13, R14 - ADDQ R15, SI - - // Write out the results - MOVQ out+0(FP), AX - MOVQ SI, (AX) - MOVQ R8, 8(AX) - MOVQ R10, 16(AX) - MOVQ R12, 24(AX) - MOVQ R14, 32(AX) - RET - -// func fePow2k(out *Element, a 
*Element, k uint64) -TEXT ·fePow2k(SB), NOSPLIT|NOFRAME, $0-24 - MOVQ a+8(FP), CX - MOVQ k+16(FP), BX - -pow2k_loop: - // r0 = x0*x0 + x1*38*x4 + x2*38*x3 - - // r00, r01 = x0*x0 - MOVQ (CX), AX - MULQ (CX) - MOVQ AX, SI - MOVQ DX, DI - - // r00, r01 += x1*38*x4 - MOVQ 8(CX), DX - IMUL3Q $0x26, DX, AX - MULQ 32(CX) - ADDQ AX, SI - ADCQ DX, DI - - // r00, r01 += x2*38*x3 - MOVQ 16(CX), DX - IMUL3Q $0x26, DX, AX - MULQ 24(CX) - ADDQ AX, SI - ADCQ DX, DI - - // r1 = x0*2*x1 + x2*38*x4 + x3*19*x3 - - // r10, r11 = x0*2*x1 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 8(CX) - MOVQ AX, R8 - MOVQ DX, R9 - - // r10, r11 += x2*38*x4 - MOVQ 16(CX), DX - IMUL3Q $0x26, DX, AX - MULQ 32(CX) - ADDQ AX, R8 - ADCQ DX, R9 - - // r10, r11 += x3*19*x3 - MOVQ 24(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 24(CX) - ADDQ AX, R8 - ADCQ DX, R9 - - // r2 = x0*2*x2 + x1*x1 + x3*38*x4 - - // r20, r21 = x0*2*x2 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 16(CX) - MOVQ AX, R10 - MOVQ DX, R11 - - // r20, r21 += x1*x1 - MOVQ 8(CX), AX - MULQ 8(CX) - ADDQ AX, R10 - ADCQ DX, R11 - - // r20, r21 += x3*38*x4 - MOVQ 24(CX), DX - IMUL3Q $0x26, DX, AX - MULQ 32(CX) - ADDQ AX, R10 - ADCQ DX, R11 - - // r3 = x0*2*x3 + x1*2*x2 + x4*19*x4 - - // r30, r31 = x0*2*x3 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 24(CX) - MOVQ AX, R12 - MOVQ DX, R13 - - // r30, r31 += x1*2*x2 - MOVQ 8(CX), AX - SHLQ $0x01, AX - MULQ 16(CX) - ADDQ AX, R12 - ADCQ DX, R13 - - // r30, r31 += x4*19*x4 - MOVQ 32(CX), DX - IMUL3Q $0x13, DX, AX - MULQ 32(CX) - ADDQ AX, R12 - ADCQ DX, R13 - - // r4 = x0*2*x4 + x1*2*x3 + x2*x2 - - // r40, r41 = x0*2*x4 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 32(CX) - MOVQ AX, R14 - MOVQ DX, R15 - - // r40, r41 += x1*2*x3 - MOVQ 8(CX), AX - SHLQ $0x01, AX - MULQ 24(CX) - ADDQ AX, R14 - ADCQ DX, R15 - - // r40, r41 += x2*x2 - MOVQ 16(CX), AX - MULQ 16(CX) - ADDQ AX, R14 - ADCQ DX, R15 - - // Reduce - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, SI, DI - ANDQ AX, SI - SHLQ $0x0d, R8, R9 - ANDQ AX, R8 - ADDQ DI, R8 - SHLQ $0x0d, R10, R11 - ANDQ AX, R10 - ADDQ R9, R10 - SHLQ $0x0d, R12, R13 - ANDQ AX, R12 - ADDQ R11, R12 - SHLQ $0x0d, R14, R15 - ANDQ AX, R14 - ADDQ R13, R14 - IMUL3Q $0x13, R15, R15 - ADDQ R15, SI - MOVQ SI, DI - MOVQ R8, R9 - MOVQ R10, R11 - MOVQ R12, R13 - MOVQ R14, R15 - ANDQ AX, SI - ANDQ AX, R8 - ANDQ AX, R10 - ANDQ AX, R12 - ANDQ AX, R14 - SHRQ $0x33, DI - SHRQ $0x33, R9 - SHRQ $0x33, R11 - SHRQ $0x33, R13 - SHRQ $0x33, R15 - IMUL3Q $0x13, R15, R15 - ADDQ DI, R8 - ADDQ R9, R10 - ADDQ R11, R12 - ADDQ R13, R14 - ADDQ R15, SI - - // Write out the results - MOVQ out+0(FP), CX - MOVQ SI, (CX) - MOVQ R8, 8(CX) - MOVQ R10, 16(CX) - MOVQ R12, 24(CX) - MOVQ R14, 32(CX) - DECQ BX - JNZ pow2k_loop - RET diff --git a/internal/field/field_u64_amd64_test.go b/internal/field/field_u64_amd64_test.go deleted file mode 100644 index 42773df..0000000 --- a/internal/field/field_u64_amd64_test.go +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright (c) 2017 George Tankersley. All rights reserved. -// Copyright (c) 2021 Oasis Labs Inc. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -//go:build amd64 && !purego && !force32bit -// +build amd64,!purego,!force32bit - -package field - -import ( - "math/rand" - "reflect" - "testing" - "testing/quick" -) - -// quickCheckConfig will make each quickcheck test run (1024 * -quickchecks) -// times. The default value of -quickchecks is 100. -var quickCheckConfig = &quick.Config{MaxCountScale: 1 << 10} - -func generateElement(rand *rand.Rand) Element { - // Generation strategy: generate random limb values of [52, 51, 51, 51, 51] - // bits, like the ones returned by lightReduce. - const low_52_bit_mask = (1 << 52) - 1 - return NewElement51( - rand.Uint64()&low_52_bit_mask, - rand.Uint64()&low_51_bit_mask, - rand.Uint64()&low_51_bit_mask, - rand.Uint64()&low_51_bit_mask, - rand.Uint64()&low_51_bit_mask, - ) -} - -// weirdLimbs can be combined to generate a range of edge-case field elements. -// 0 and -1 are intentionally more weighted, as they combine well. -var ( - weirdLimbs51 = []uint64{ - 0, 0, 0, 0, - 1, - 19 - 1, - 19, - 0x2aaaaaaaaaaaa, - 0x5555555555555, - (1 << 51) - 20, - (1 << 51) - 19, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - } - weirdLimbs52 = []uint64{ - 0, 0, 0, 0, 0, 0, - 1, - 19 - 1, - 19, - 0x2aaaaaaaaaaaa, - 0x5555555555555, - (1 << 51) - 20, - (1 << 51) - 19, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - (1 << 51) - 1, (1 << 51) - 1, - 1 << 51, - (1 << 51) + 1, - (1 << 52) - 19, - (1 << 52) - 1, - } -) - -func generateWeirdElement(rand *rand.Rand) Element { - return NewElement51( - weirdLimbs52[rand.Intn(len(weirdLimbs52))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - weirdLimbs51[rand.Intn(len(weirdLimbs51))], - ) -} - -func (x Element) Generate(rand *rand.Rand, size int) reflect.Value { - if rand.Intn(2) == 0 { - return reflect.ValueOf(generateWeirdElement(rand)) - } - return reflect.ValueOf(generateElement(rand)) -} - -// isInAsmBounds returns whether the element is within the expected bit -// size bounds after a light reduction, based on the behavior of -// the amd64 specific assembly multiply/pow2k routines. 
-func isInAsmBounds(x *Element) bool { - const ( - l0Max = 1<<51 + 155629 - l14Max = 1<<51 + 8191 - ) - - return x.inner[0] < l0Max && - x.inner[1] < l14Max && - x.inner[2] < l14Max && - x.inner[3] < l14Max && - x.inner[4] < l14Max -} - -func TestFeMulAsm(t *testing.T) { - t.Run("FeMul/mul", func(t *testing.T) { - testFeMul(t) - }) - t.Run("FePow2k/mul", func(t *testing.T) { - testFePow2k(t) - }) -} - -func testFeMul(t *testing.T) { - mulDistributesOverAdd := func(x, y, z Element) bool { - var t1, t2, t3, t1Asm, t2Asm, t3Asm Element - - // Note: The coefficients are allowed to grow up to 2^54 - // between reductions, which is what the generic mul - // implementation does. - // - // The assembly reduces to 2^[51,52], which is different, - // but still correct as the shorter coefficients will not - // cause overflows. - // - // Attempts were made to make the assembly match the - // generic code exactly, but it ended up being slightly - // slower. - - // Compute t1 = (x+y)*z - t1.Add(&x, &y) - feMul(&t1Asm, &t1, &z) - feMulGeneric(&t1, &t1, &z) - if t1.Equal(&t1Asm) != 1 || !isInAsmBounds(&t1Asm) { - return false - } - - // Compute t2 = x*z + y*z - feMul(&t2Asm, &x, &z) - feMul(&t3Asm, &y, &z) - feMulGeneric(&t2, &x, &z) - feMulGeneric(&t3, &y, &z) - if t2.Equal(&t2Asm) != 1 || !isInAsmBounds(&t2Asm) { - return false - } - if t3.Equal(&t3Asm) != 1 || !isInAsmBounds(&t3Asm) { - return false - } - t2.Add(&t2, &t3) - t2Asm.Add(&t2Asm, &t3Asm) - - return t1.Equal(&t2) == 1 && t2Asm.Equal(&t1) == 1 && t1Asm.Equal(&t2) == 1 - } - - if err := quick.Check(mulDistributesOverAdd, quickCheckConfig); err != nil { - t.Error(err) - } -} - -func testFePow2k(t *testing.T) { - a, ap16 := testConstants["A"], testConstants["AP16"] - - var shouldBeAp16 Element - fePow2k(&shouldBeAp16, a, 4) - - if shouldBeAp16.Equal(ap16) != 1 { - t.Fatalf("a ^ (2^4) != ap16 (Got: %v)", shouldBeAp16) - } -} diff --git a/internal/field/field_u64_generic.go b/internal/field/field_u64_generic.go deleted file mode 100644 index 3c18ca6..0000000 --- a/internal/field/field_u64_generic.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2021 Oasis Labs Inc. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -//go:build (purego || (!amd64 && force64bit) || arm64 || ppc64le || ppc64 || s390x) && !force32bit -// +build purego !amd64,force64bit arm64 ppc64le ppc64 s390x -// +build !force32bit - -package field - -func feMul(fe, a, b *Element) { - feMulGeneric(fe, a, b) -} - -func fePow2k(fe, t *Element, k uint) { - fePow2kGeneric(fe, t, k) -} From 09a34d4d16607efd4685f2f1c1f1cf6620caf007 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 2 Aug 2021 13:25:04 +0000 Subject: [PATCH 3/3] primitives/x25519: Remove the `x/crypto/curve25519` fallback Upstream got rid of the assembly. This is marginally slower, but it will use fiat, and it's only a few percent. --- primitives/x25519/x25519.go | 16 +----------- primitives/x25519/x25519_amd64.go | 38 ----------------------------- primitives/x25519/x25519_generic.go | 35 -------------------------- primitives/x25519/x25519_test.go | 27 -------------------- 4 files changed, 1 insertion(+), 115 deletions(-) delete mode 100644 primitives/x25519/x25519_amd64.go delete mode 100644 primitives/x25519/x25519_generic.go diff --git a/primitives/x25519/x25519.go b/primitives/x25519/x25519.go index 29a77ae..a9862c3 100644 --- a/primitives/x25519/x25519.go +++ b/primitives/x25519/x25519.go @@ -37,8 +37,6 @@ import ( "crypto/subtle" "fmt" - xcurve "golang.org/x/crypto/curve25519" - "github.com/oasisprotocol/curve25519-voi/curve" "github.com/oasisprotocol/curve25519-voi/curve/scalar" _ "github.com/oasisprotocol/curve25519-voi/internal/toolchain" @@ -55,11 +53,7 @@ const ( // Basepoint is the canonical Curve25519 generator. var Basepoint []byte -var ( - basePoint = [32]byte{9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} - - debugNoXcurve bool -) +var basePoint = [32]byte{9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // ScalarMult sets dst to the product in*base where dst and base are the x // coordinates of group points and all values are in little-endian form. @@ -68,14 +62,6 @@ var ( // zeroes, irrespective of the scalar. Instead, use the X25519 function, which // will return an error. func ScalarMult(dst, in, base *[32]byte) { - // If the `x/crypto/curve25519` package would be faster, and we - // are not exercising the implementation provided by this package - // (eg: testing or benchmarking), use that instead. - if xcurveFaster && !debugNoXcurve { - xcurve.ScalarMult(dst, in, base) - return - } - var ec [ScalarSize]byte copy(ec[:], in[:]) clampScalar(ec[:]) diff --git a/primitives/x25519/x25519_amd64.go b/primitives/x25519/x25519_amd64.go deleted file mode 100644 index 23d9d21..0000000 --- a/primitives/x25519/x25519_amd64.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2021 Oasis Labs Inc. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. 
Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -//go:build amd64 && !purego -// +build amd64,!purego - -package x25519 - -// If this is amd64, and assembly is not disabled via build tags, just -// use `x/crypto/curve25519`'s scalar multiply, because it will be -// faster by virtue of being entirely in assembly. -const xcurveFaster = true diff --git a/primitives/x25519/x25519_generic.go b/primitives/x25519/x25519_generic.go deleted file mode 100644 index 30e2b21..0000000 --- a/primitives/x25519/x25519_generic.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2021 Oasis Labs Inc. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -//go:build !amd64 || purego -// +build !amd64 purego - -package x25519 - -const xcurveFaster = false diff --git a/primitives/x25519/x25519_test.go b/primitives/x25519/x25519_test.go index 66c9a04..3176dc9 100644 --- a/primitives/x25519/x25519_test.go +++ b/primitives/x25519/x25519_test.go @@ -66,15 +66,6 @@ func TestScalarBaseMult(t *testing.T) { func TestX25519(t *testing.T) { t.Run("voi", testX25519) - if xcurveFaster { - t.Run("voi/debugNoXcurve", func(t *testing.T) { - debugNoXcurve = true - defer func() { - debugNoXcurve = false - }() - testX25519(t) - }) - } } func testX25519(t *testing.T) { @@ -166,15 +157,6 @@ func testTestVectors(t *testing.T, scalarMult func(dst, scalar, point *[32]byte) func TestScalarMult(t *testing.T) { t.Run("voi", testScalarMult) - if xcurveFaster { - t.Run("voi/debugNoXcurve", func(t *testing.T) { - debugNoXcurve = true - defer func() { - debugNoXcurve = false - }() - testScalarMult(t) - }) - } } func testScalarMult(t *testing.T) { @@ -245,15 +227,6 @@ func benchScalarBaseMult(b *testing.B, scalarBaseMult func(dst, scalar *[32]byte func BenchmarkScalarMult(b *testing.B) { b.Run("voi", func(b *testing.B) { benchScalarMult(b, ScalarMult) }) - if xcurveFaster { - b.Run("voi/debugNoXcurve", func(b *testing.B) { - debugNoXcurve = true - defer func() { - debugNoXcurve = false - }() - benchScalarMult(b, ScalarMult) - }) - } b.Run("xcrypto", func(b *testing.B) { benchScalarMult(b, xcurve.ScalarMult) //nolint:staticcheck })
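A note on the `ScalarMult` deprecation text kept in `primitives/x25519/x25519.go`: with the `x/crypto/curve25519` fallback gone, callers should still prefer `X25519`, which returns an error on low-order points instead of silently writing all zeroes to dst. A minimal sketch of that usage, assuming the package keeps the same `X25519(scalar, point []byte) ([]byte, error)` shape as `golang.org/x/crypto/curve25519` (the `ScalarSize` constant and `Basepoint` variable are visible in the diff above; the exact `X25519` signature here is an assumption):

package main

import (
	"crypto/rand"
	"fmt"

	"github.com/oasisprotocol/curve25519-voi/primitives/x25519"
)

func main() {
	// Generate a random private scalar.
	priv := make([]byte, x25519.ScalarSize)
	if _, err := rand.Read(priv); err != nil {
		panic(err)
	}

	// Derive the public key by multiplying the private scalar by the basepoint.
	pub, err := x25519.X25519(priv, x25519.Basepoint)
	if err != nil {
		panic(err)
	}

	// A shared secret with a peer would be computed the same way, as
	// x25519.X25519(priv, peerPub). Unlike the deprecated ScalarMult,
	// this rejects low-order inputs with an error rather than producing
	// an all-zero output.
	fmt.Printf("public key: %x\n", pub)
}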