diff --git a/blake2b/blake2bAVX2_amd64.s b/blake2b/blake2bAVX2_amd64.s
index 8608a7f7d1..a78ab3b3d9 100644
--- a/blake2b/blake2bAVX2_amd64.s
+++ b/blake2b/blake2bAVX2_amd64.s
@@ -282,14 +282,12 @@ TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
 	MOVQ blocks_len+32(FP), DI
 
 	MOVQ SP, DX
-	MOVQ SP, R9
-	ADDQ $31, R9
-	ANDQ $~31, R9
-	MOVQ R9, SP
+	ADDQ $31, DX
+	ANDQ $~31, DX
 
-	MOVQ CX, 16(SP)
+	MOVQ CX, 16(DX)
 	XORQ CX, CX
-	MOVQ CX, 24(SP)
+	MOVQ CX, 24(DX)
 
 	VMOVDQU ·AVX2_c40<>(SB), Y4
 	VMOVDQU ·AVX2_c48<>(SB), Y5
@@ -301,33 +299,33 @@ TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
-	MOVQ R9, 8(SP)
+	MOVQ R9, 8(DX)
 
 loop:
 	ADDQ $128, R8
-	MOVQ R8, 0(SP)
+	MOVQ R8, 0(DX)
 	CMPQ R8, $128
 	JGE  noinc
 	INCQ R9
-	MOVQ R9, 8(SP)
+	MOVQ R9, 8(DX)
 
 noinc:
 	VMOVDQA Y8, Y0
 	VMOVDQA Y9, Y1
 	VMOVDQA Y6, Y2
-	VPXOR   0(SP), Y7, Y3
+	VPXOR   0(DX), Y7, Y3
 
 	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
-	VMOVDQA Y12, 32(SP)
-	VMOVDQA Y13, 64(SP)
-	VMOVDQA Y14, 96(SP)
-	VMOVDQA Y15, 128(SP)
+	VMOVDQA Y12, 32(DX)
+	VMOVDQA Y13, 64(DX)
+	VMOVDQA Y14, 96(DX)
+	VMOVDQA Y15, 128(DX)
 
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
-	VMOVDQA Y12, 160(SP)
-	VMOVDQA Y13, 192(SP)
-	VMOVDQA Y14, 224(SP)
-	VMOVDQA Y15, 256(SP)
+	VMOVDQA Y12, 160(DX)
+	VMOVDQA Y13, 192(DX)
+	VMOVDQA Y14, 224(DX)
+	VMOVDQA Y15, 256(DX)
 
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
@@ -347,8 +345,8 @@ noinc:
 	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 
-	ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
-	ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)
+	ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
+	ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
 
 	VPXOR Y0, Y8, Y8
 	VPXOR Y1, Y9, Y9
@@ -366,7 +364,6 @@ noinc:
 	VMOVDQU Y9, 32(AX)
 	VZEROUPPER
 
-	MOVQ DX, SP
 	RET
 
 #define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
@@ -584,11 +581,9 @@ TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 
-	MOVQ SP, BP
-	MOVQ SP, R9
-	ADDQ $15, R9
-	ANDQ $~15, R9
-	MOVQ R9, SP
+	MOVQ SP, R10
+	ADDQ $15, R10
+	ANDQ $~15, R10
 
 	VMOVDQU ·AVX_c40<>(SB), X0
 	VMOVDQU ·AVX_c48<>(SB), X1
@@ -596,8 +591,8 @@ TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
 	VMOVDQA X1, X9
 
 	VMOVDQU ·AVX_iv3<>(SB), X0
-	VMOVDQA X0, 0(SP)
-	XORQ    CX, 0(SP)  // 0(SP) = ·AVX_iv3 ^ (CX || 0)
+	VMOVDQA X0, 0(R10)
+	XORQ    CX, 0(R10)  // 0(R10) = ·AVX_iv3 ^ (CX || 0)
 
 	VMOVDQU 0(AX), X10
 	VMOVDQU 16(AX), X11
@@ -624,35 +619,35 @@ noinc:
 	VMOVDQU ·AVX_iv2<>(SB), X6
 	VPXOR   X15, X6, X6
-	VMOVDQA 0(SP), X7
+	VMOVDQA 0(R10), X7
 
 	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
-	VMOVDQA X12, 16(SP)
-	VMOVDQA X13, 32(SP)
-	VMOVDQA X14, 48(SP)
-	VMOVDQA X15, 64(SP)
+	VMOVDQA X12, 16(R10)
+	VMOVDQA X13, 32(R10)
+	VMOVDQA X14, 48(R10)
+	VMOVDQA X15, 64(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 
 	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
-	VMOVDQA X12, 80(SP)
-	VMOVDQA X13, 96(SP)
-	VMOVDQA X14, 112(SP)
-	VMOVDQA X15, 128(SP)
+	VMOVDQA X12, 80(R10)
+	VMOVDQA X13, 96(R10)
+	VMOVDQA X14, 112(R10)
+	VMOVDQA X15, 128(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
 	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
-	VMOVDQA X12, 144(SP)
-	VMOVDQA X13, 160(SP)
-	VMOVDQA X14, 176(SP)
-	VMOVDQA X15, 192(SP)
+	VMOVDQA X12, 144(R10)
+	VMOVDQA X13, 160(R10)
+	VMOVDQA X14, 176(R10)
+	VMOVDQA X15, 192(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 
 	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
-	VMOVDQA X12, 208(SP)
-	VMOVDQA X13, 224(SP)
-	VMOVDQA X14, 240(SP)
-	VMOVDQA X15, 256(SP)
+	VMOVDQA X12, 208(R10)
+	VMOVDQA X13, 224(R10)
+	VMOVDQA X14, 240(R10)
+	VMOVDQA X15, 256(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
@@ -712,14 +707,14 @@ noinc:
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
 	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
 	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
 	VMOVDQU 32(AX), X14
@@ -746,5 +741,4 @@ noinc:
 	MOVQ R9, 8(BX)
 	VZEROUPPER
 
-	MOVQ BP, SP
 	RET
diff --git a/blake2b/blake2b_amd64.s b/blake2b/blake2b_amd64.s
index 1f4c6a9279..bb72a03913 100644
--- a/blake2b/blake2b_amd64.s
+++ b/blake2b/blake2b_amd64.s
@@ -118,15 +118,13 @@ TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 
-	MOVQ SP, BP
-	MOVQ SP, R9
-	ADDQ $15, R9
-	ANDQ $~15, R9
-	MOVQ R9, SP
+	MOVQ SP, R10
+	ADDQ $15, R10
+	ANDQ $~15, R10
 
 	MOVOU ·iv3<>(SB), X0
-	MOVO  X0, 0(SP)
-	XORQ  CX, 0(SP)  // 0(SP) = ·iv3 ^ (CX || 0)
+	MOVO  X0, 0(R10)
+	XORQ  CX, 0(R10)  // 0(R10) = ·iv3 ^ (CX || 0)
 
 	MOVOU ·c40<>(SB), X13
 	MOVOU ·c48<>(SB), X14
@@ -156,35 +154,35 @@ noinc:
 	MOVOU ·iv2<>(SB), X6
 	PXOR  X8, X6
-	MOVO  0(SP), X7
+	MOVO  0(R10), X7
 
 	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
-	MOVO  X8, 16(SP)
-	MOVO  X9, 32(SP)
-	MOVO  X10, 48(SP)
-	MOVO  X11, 64(SP)
+	MOVO  X8, 16(R10)
+	MOVO  X9, 32(R10)
+	MOVO  X10, 48(R10)
+	MOVO  X11, 64(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 
 	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
-	MOVO  X8, 80(SP)
-	MOVO  X9, 96(SP)
-	MOVO  X10, 112(SP)
-	MOVO  X11, 128(SP)
+	MOVO  X8, 80(R10)
+	MOVO  X9, 96(R10)
+	MOVO  X10, 112(R10)
+	MOVO  X11, 128(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
 	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
-	MOVO  X8, 144(SP)
-	MOVO  X9, 160(SP)
-	MOVO  X10, 176(SP)
-	MOVO  X11, 192(SP)
+	MOVO  X8, 144(R10)
+	MOVO  X9, 160(R10)
+	MOVO  X10, 176(R10)
+	MOVO  X11, 192(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 
 	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
-	MOVO  X8, 208(SP)
-	MOVO  X9, 224(SP)
-	MOVO  X10, 240(SP)
-	MOVO  X11, 256(SP)
+	MOVO  X8, 208(R10)
+	MOVO  X9, 224(R10)
+	MOVO  X10, 240(R10)
+	MOVO  X11, 256(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
@@ -244,14 +242,14 @@ noinc:
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
 	MOVOU 32(AX), X10
@@ -277,5 +275,4 @@ noinc:
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 
-	MOVQ BP, SP
 	RET
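Note (not part of the patch): all three functions keep their scratch area in the local stack frame, but the aligned base pointer is now computed into a spare register (DX for the 32-byte-aligned AVX2 frame, R10 for the 16-byte-aligned AVX and SSE4 frames) instead of being written back into SP. SP stays fixed for the whole function, so the old MOVQ DX, SP / MOVQ BP, SP restores before RET become unnecessary. The ADDQ $31 / ANDQ $~31 (and $15 / $~15) pairs are the usual align-up idiom; a minimal Go sketch of the same computation follows, with the helper name alignUp chosen purely for illustration:

	package main

	import "fmt"

	// alignUp rounds p up to the next multiple of align, which must be a
	// power of two. It mirrors the assembly's
	// ADDQ $(align-1), reg / ANDQ $~(align-1), reg sequence.
	func alignUp(p, align uintptr) uintptr {
		return (p + align - 1) &^ (align - 1)
	}

	func main() {
		// 0x7ffd1234 rounded up to a 32-byte boundary is 0x7ffd1240.
		fmt.Printf("%#x\n", alignUp(0x7ffd1234, 32))
	}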