diff --git a/_asm/transpose_amd64_asm.go b/_asm/transpose_amd64_asm.go
index af6e3d0..fd567c5 100644
--- a/_asm/transpose_amd64_asm.go
+++ b/_asm/transpose_amd64_asm.go
@@ -149,12 +149,13 @@ func transpose128() {
 	tmp := XMM()
 	b := GP8()
 	o := GP32()
+	cc := GP64()

 	Comment("Initialize rr, current row")
 	rr := zero()
 	Label("row_loop")

 	Comment("Initialize cc, current col")
-	cc := zero()
+	XORQ(cc, cc)
 	Label("col_loop")

 	Comment("Initialize (rr * ncols + cc) / 8, here ncols=128")
@@ -203,6 +204,214 @@ func transpose128() {
 	RET()
 }

+func transpose128Rev() {
+	// transpose128Rev function
+	TEXT("transpose128Rev", NOSPLIT, "func(in, out *byte)")
+	Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128")
+
+	in := Mem{Base: Load(Param("in"), GP64())}
+	out := Mem{Base: Load(Param("out"), GP64())}
+
+	tmp := XMM()
+	b := GP8()
+	o := GP32()
+
+	Comment("Initialize rr, current row, 96")
+	rr := zero()
+	cc := GP64()
+	addr := GP64()
+
+	Label("row_loop_b3")
+	Comment("Initialize cc, current col")
+	XORQ(cc, cc)
+	Label("col_loop_b3")
+	Comment("Initialize (rr * ncols + cc) / 8, here ncols=128")
+	MOVQ(rr, addr)
+	ADDQ(Imm(96), addr)
+	Comment("Multiple with ncols")
+	SHLQ(Imm(7), addr)
+	ADDQ(cc, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Construct one XMM with first byte of first 16 rows")
+	for i := 0; i < 16; i++ {
+		MOVB(in.Idx(addr, 1), b)
+		PINSRB(Imm(uint64(i)), b.As32(), tmp)
+		Comment("Add ncols / 8")
+		ADDQ(Imm(16), addr)
+	}
+
+	Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128")
+	MOVQ(cc, addr)
+	ADDQ(Imm(7), addr)
+	Comment("Multiple with nrows")
+	SHLQ(Imm(7), addr)
+	ADDQ(rr, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes")
+	for i := 7; i >= 0; i-- {
+		PMOVMSKB(tmp, o)
+		MOVW(o.As16(), out.Idx(addr, 1))
+		PSLLQ(Imm(1), tmp)
+		Comment("Sub nrows / 8")
+		SUBQ(Imm(16), addr)
+	}
+
+	Comment("Compare cc with ncols, here ncols=128")
+	ADDQ(Imm(8), cc)
+	CMPQ(cc, Imm(128))
+	JL(LabelRef("col_loop_b3"))
+
+	Comment("Compare rr with nrows, here nrows=128")
+	ADDQ(Imm(16), rr)
+	CMPQ(rr, U8(32))
+	JL(LabelRef("row_loop_b3"))
+
+	Label("row_loop_b2")
+	Comment("Initialize cc, current col")
+	XORQ(cc, cc)
+	Label("col_loop_b2")
+	Comment("Initialize (rr * ncols + cc) / 8, here ncols=128")
+	MOVQ(rr, addr)
+	ADDQ(Imm(32), addr)
+	Comment("Multiple with ncols")
+	SHLQ(Imm(7), addr)
+	ADDQ(cc, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Construct one XMM with first byte of first 16 rows")
+	for i := 0; i < 16; i++ {
+		MOVB(in.Idx(addr, 1), b)
+		PINSRB(Imm(uint64(i)), b.As32(), tmp)
+		Comment("Add ncols / 8")
+		ADDQ(Imm(16), addr)
+	}
+
+	Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128")
+	MOVQ(cc, addr)
+	ADDQ(Imm(7), addr)
+	Comment("Multiple with nrows")
+	SHLQ(Imm(7), addr)
+	ADDQ(rr, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes")
+	for i := 7; i >= 0; i-- {
+		PMOVMSKB(tmp, o)
+		MOVW(o.As16(), out.Idx(addr, 1))
+		PSLLQ(Imm(1), tmp)
+		Comment("Sub nrows / 8")
+		SUBQ(Imm(16), addr)
+	}
+
+	Comment("Compare cc with ncols, here ncols=128")
+	ADDQ(Imm(8), cc)
+	CMPQ(cc, Imm(128))
+	JL(LabelRef("col_loop_b2"))
+
+	Comment("Compare rr with nrows, here nrows=128")
+	ADDQ(Imm(16), rr)
+	CMPQ(rr, U8(64))
+	JL(LabelRef("row_loop_b2"))
+
+	Label("row_loop_b1")
+	Comment("Initialize cc, current col")
+	XORQ(cc, cc)
+	Label("col_loop_b1")
+	Comment("Initialize (rr * ncols + cc) / 8, here ncols=128")
+	MOVQ(rr, addr)
+	SUBQ(Imm(32), addr)
+	Comment("Multiple with ncols")
+	SHLQ(Imm(7), addr)
+	ADDQ(cc, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Construct one XMM with first byte of first 16 rows")
+	for i := 0; i < 16; i++ {
+		MOVB(in.Idx(addr, 1), b)
+		PINSRB(Imm(uint64(i)), b.As32(), tmp)
+		Comment("Add ncols / 8")
+		ADDQ(Imm(16), addr)
+	}
+
+	Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128")
+	MOVQ(cc, addr)
+	ADDQ(Imm(7), addr)
+	Comment("Multiple with nrows")
+	SHLQ(Imm(7), addr)
+	ADDQ(rr, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes")
+	for i := 7; i >= 0; i-- {
+		PMOVMSKB(tmp, o)
+		MOVW(o.As16(), out.Idx(addr, 1))
+		PSLLQ(Imm(1), tmp)
+		Comment("Sub nrows / 8")
+		SUBQ(Imm(16), addr)
+	}
+
+	Comment("Compare cc with ncols, here ncols=128")
+	ADDQ(Imm(8), cc)
+	CMPQ(cc, Imm(128))
+	JL(LabelRef("col_loop_b1"))
+
+	Comment("Compare rr with nrows, here nrows=128")
+	ADDQ(Imm(16), rr)
+	CMPQ(rr, U8(96))
+	JL(LabelRef("row_loop_b1"))
+
+	Label("row_loop_b0")
+	Comment("Initialize cc, current col")
+	XORQ(cc, cc)
+	Label("col_loop_b0")
+	Comment("Initialize (rr * ncols + cc) / 8, here ncols=128")
+	MOVQ(rr, addr)
+	SUBQ(Imm(96), addr)
+	Comment("Multiple with ncols")
+	SHLQ(Imm(7), addr)
+	ADDQ(cc, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Construct one XMM with first byte of first 16 rows")
+	for i := 0; i < 16; i++ {
+		MOVB(in.Idx(addr, 1), b)
+		PINSRB(Imm(uint64(i)), b.As32(), tmp)
+		Comment("Add ncols / 8")
+		ADDQ(Imm(16), addr)
+	}
+
+	Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128")
+	MOVQ(cc, addr)
+	ADDQ(Imm(7), addr)
+	Comment("Multiple with nrows")
+	SHLQ(Imm(7), addr)
+	ADDQ(rr, addr)
+	SHRQ(Imm(3), addr)
+
+	Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes")
+	for i := 7; i >= 0; i-- {
+		PMOVMSKB(tmp, o)
+		MOVW(o.As16(), out.Idx(addr, 1))
+		PSLLQ(Imm(1), tmp)
+		Comment("Sub nrows / 8")
+		SUBQ(Imm(16), addr)
+	}
+
+	Comment("Compare cc with ncols, here ncols=128")
+	ADDQ(Imm(8), cc)
+	CMPQ(cc, Imm(128))
+	JL(LabelRef("col_loop_b0"))
+
+	Comment("Compare rr with nrows, here nrows=128")
+	ADDQ(Imm(16), rr)
+	CMPQ(rr, U8(128))
+	JL(LabelRef("row_loop_b0"))
+
+	RET()
+}
+
 func xor32x128() {
 	// xor32x128 function
 	TEXT("xor32x128", NOSPLIT, "func(x, y, out *byte)")
@@ -728,22 +937,21 @@ func sbox128() {
 	PANDN(f, t8) // e9

 	Comment("e10=^(g1 & l1)")
-	MOVOU(buffer.Offset(1*16), t9)
-	PAND(t7, t9)
-	PANDN(f, t9) // e10
+	MOVOU(buffer.Offset(1*16), t1)
+	PAND(t7, t1)
+	PANDN(f, t1) // e10
 	Comment("r6=e9 ^ e10")
-	PXOR(t9, t8) // r6 = e9 ^ e10
+	PXOR(t1, t8) // r6 = e9 ^ e10

 	Comment("e11=^(g0 & l0)")
 	MOVOU(buffer, t10)
 	PAND(t11, t10)
 	PANDN(f, t10) // e11
 	Comment("r7=e11 ^ e10")
-	PXOR(t10, t9) // r7 = e11 ^ e10
-	Comment("store r6 r7")
+	PXOR(t10, t1) // r7 = e11 ^ e10
+	Comment("store r6")
 	MOVOU(t8, buffer.Offset(28*16))
-	MOVOU(t9, buffer.Offset(29*16))

 	Comment("e12=^(m6 & k3)")
 	MOVOU(buffer.Offset((8+6)*16), t7) // m6
@@ -787,13 +995,13 @@ func sbox128() {
 	PXOR(t10, t9) // r11 = e17 ^ e16 = t9

 	Comment("start output function")
+	// t1 = r7
 	// t7 = r8
 	// t11 = r9
 	// t8 = r10
 	// t9 = r11
 	Comment("[t1]=r7 ^ r9")
-	MOVOU(buffer.Offset((22+7)*16), t1) // r7
-	PXOR(t1, t11)                       // t11 = r7 ^ r9
+	PXOR(t1, t11) // t11 = r7 ^ r9
 	Comment("t2=t1 ^ r1")
 	MOVOU(buffer.Offset((22+1)*16), t2) // r1
 	PXOR(t11, t2)                       // t2 = r1 ^ t11
@@ -1166,6 +1374,7 @@ func main() {
 	transpose64()
 	transpose64Rev()
 	transpose128()
+	transpose128Rev()
 	xor32x128()
 	xor32x128avx()
 	xorRoundKey128()
diff --git a/bs128.go b/bs128.go
index e96ae6e..df8f452 100644
--- a/bs128.go
+++ b/bs128.go
@@ -56,11 +56,5 @@ func (bs bs128) EncryptBlocks(xk []uint32, dst, src []byte) {
 		b2 = bs.xor32(b2, bs.l(bs.tao(bs.xorRK(xk[i*4+2], rk, b3, b0, b1), buffer), buffer))
 		b3 = bs.xor32(b3, bs.l(bs.tao(bs.xorRK(xk[i*4+3], rk, b0, b1, b2), buffer), buffer))
 	}
-	copy(rk, b0)
-	copy(state[:], b3)
-	copy(state[96*bitSize:], rk)
-	copy(rk, b1)
-	copy(state[32*bitSize:], b2)
-	copy(state[64*bitSize:], rk)
-	transpose128(&state[0], &dst[0])
+	transpose128Rev(&state[0], &dst[0])
 }
diff --git a/bs128_test.go b/bs128_test.go
index fb62a84..d978198 100644
--- a/bs128_test.go
+++ b/bs128_test.go
@@ -127,6 +127,25 @@ func BenchmarkL128(b *testing.B) {
 	}
 }

+func BenchmarkXorRK(b *testing.B) {
+	b0 := make([]byte, 32*BS128.bytes())
+	b1 := make([]byte, 32*BS128.bytes())
+	b2 := make([]byte, 32*BS128.bytes())
+	rk := make([]byte, 32*BS128.bytes())
+	k := uint32(0xa3b1bac6)
+	for i := 0; i < b.N; i++ {
+		BS128.xorRK(k, rk, b0, b1, b2)
+	}
+}
+
+func BenchmarkXor32(b *testing.B) {
+	b0 := make([]byte, 32*BS128.bytes())
+	b1 := make([]byte, 32*BS128.bytes())
+	for i := 0; i < b.N; i++ {
+		BS128.xor32(b0, b1)
+	}
+}
+
 func TestBS128EncryptBlocks(t *testing.T) {
 	bitSize := BS128.bytes()
 	key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
diff --git a/transpose128_amd64.go b/transpose128_amd64.go
index a103d67..25eb90a 100644
--- a/transpose128_amd64.go
+++ b/transpose128_amd64.go
@@ -13,6 +13,9 @@ func transpose64Rev(in *byte, out *byte)
 // Bit level matrix transpose, 128x128
 func transpose128(in *byte, out *byte)

+// Bit level matrix transpose, b0-b1-b2-b3, 128x128
+func transpose128Rev(in *byte, out *byte)
+
 // out = x xor y
 func xor32x128(x *byte, y *byte, out *byte)
diff --git a/transpose128_amd64.s b/transpose128_amd64.s
index d9e4e68..e8ea386 100644
--- a/transpose128_amd64.s
+++ b/transpose128_amd64.s
@@ -359,15 +359,520 @@ TEXT ·transpose128(SB), NOSPLIT, $0-16
 	MOVQ in+0(FP), AX
 	MOVQ out+8(FP), CX

 	// Initialize rr, current row
+	XORQ DI, DI
+
+row_loop:
+	// Initialize cc, current col
 	XORQ SI, SI

-row_loop:
+col_loop:
+	// Initialize (rr * ncols + cc) / 8, here ncols=128
+	MOVQ DI, R8
+
+	// Multiple with ncols
+	SHLQ $0x07, R8
+	ADDQ SI, R8
+	SHRQ $0x03, R8
+
+	// Construct one XMM with first byte of first 16 rows
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x00, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x01, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x02, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x03, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x04, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x05, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x06, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x07, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x08, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x09, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0a, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0b, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0c, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0d, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0e, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0f, DX, X0
+
+	// Add ncols / 8
+	ADDQ $0x10, R8
+
+	// Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128
+	MOVQ SI, R8
+	ADDQ $0x07, R8
+
+	// Multiple with nrows
+	SHLQ $0x07, R8
+	ADDQ DI, R8
+	SHRQ $0x03, R8
+
+	// Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ $0x10, R8
+
+	// Compare cc with ncols, here ncols=128
+	ADDQ $0x08, SI
+	CMPQ SI, $0x80
+	JL   col_loop
+
+	// Compare rr with nrows, here nrows=128
+	ADDQ $0x10, DI
+	CMPQ DI, $0x80
+	JL   row_loop
+	RET
+
+// func transpose128Rev(in *byte, out *byte)
+// Requires: SSE2, SSE4.1
+TEXT ·transpose128Rev(SB), NOSPLIT, $0-16
+	MOVQ in+0(FP), AX
+	MOVQ out+8(FP), CX
+
+	// Initialize rr, current row, 96
+	XORQ SI, SI
+
+row_loop_b3:
+	// Initialize cc, current col
+	XORQ DI, DI
+
+col_loop_b3:
+	// Initialize (rr * ncols + cc) / 8, here ncols=128
+	MOVQ SI, R8
+	ADDQ $0x60, R8
+
+	// Multiple with ncols
+	SHLQ $0x07, R8
+	ADDQ DI, R8
+	SHRQ $0x03, R8
+
+	// Construct one XMM with first byte of first 16 rows
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x00, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x01, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x02, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x03, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x04, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x05, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x06, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x07, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x08, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x09, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0a, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0b, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0c, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0d, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0e, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0f, DX, X0
+
+	// Add ncols / 8
+	ADDQ $0x10, R8
+
+	// Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128
+	MOVQ DI, R8
+	ADDQ $0x07, R8
+
+	// Multiple with nrows
+	SHLQ $0x07, R8
+	ADDQ SI, R8
+	SHRQ $0x03, R8
+
+	// Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ $0x10, R8
+
+	// Compare cc with ncols, here ncols=128
+	ADDQ $0x08, DI
+	CMPQ DI, $0x80
+	JL   col_loop_b3
+
+	// Compare rr with nrows, here nrows=128
+	ADDQ $0x10, SI
+	CMPQ SI, $0x20
+	JL   row_loop_b3
+
+row_loop_b2:
+	// Initialize cc, current col
+	XORQ DI, DI
+
+col_loop_b2:
+	// Initialize (rr * ncols + cc) / 8, here ncols=128
+	MOVQ SI, R8
+	ADDQ $0x20, R8
+
+	// Multiple with ncols
+	SHLQ $0x07, R8
+	ADDQ DI, R8
+	SHRQ $0x03, R8
+
+	// Construct one XMM with first byte of first 16 rows
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x00, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x01, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x02, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x03, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x04, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x05, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x06, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x07, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x08, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x09, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0a, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0b, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0c, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0d, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0e, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0f, DX, X0
+
+	// Add ncols / 8
+	ADDQ $0x10, R8
+
+	// Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128
+	MOVQ DI, R8
+	ADDQ $0x07, R8
+
+	// Multiple with nrows
+	SHLQ $0x07, R8
+	ADDQ SI, R8
+	SHRQ $0x03, R8
+
+	// Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ $0x10, R8
+
+	// Compare cc with ncols, here ncols=128
+	ADDQ $0x08, DI
+	CMPQ DI, $0x80
+	JL   col_loop_b2
+
+	// Compare rr with nrows, here nrows=128
+	ADDQ $0x10, SI
+	CMPQ SI, $0x40
+	JL   row_loop_b2
+
+row_loop_b1:
 	// Initialize cc, current col
 	XORQ DI, DI

-col_loop:
+col_loop_b1:
 	// Initialize (rr * ncols + cc) / 8, here ncols=128
 	MOVQ SI, R8
+	SUBQ $0x20, R8

 	// Multiple with ncols
 	SHLQ $0x07, R8
 	ADDQ DI, R8
 	SHRQ $0x03, R8
@@ -518,12 +1023,177 @@ col_loop:
 	// Compare cc with ncols, here ncols=128
 	ADDQ $0x08, DI
 	CMPQ DI, $0x80
-	JL   col_loop
+	JL   col_loop_b1
+
+	// Compare rr with nrows, here nrows=128
+	ADDQ $0x10, SI
+	CMPQ SI, $0x60
+	JL   row_loop_b1
+
+row_loop_b0:
+	// Initialize cc, current col
+	XORQ DI, DI
+
+col_loop_b0:
+	// Initialize (rr * ncols + cc) / 8, here ncols=128
+	MOVQ SI, R8
+	SUBQ $0x60, R8
+
+	// Multiple with ncols
+	SHLQ $0x07, R8
+	ADDQ DI, R8
+	SHRQ $0x03, R8
+
+	// Construct one XMM with first byte of first 16 rows
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x00, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x01, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x02, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x03, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x04, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x05, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x06, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x07, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x08, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x09, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0a, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0b, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0c, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0d, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0e, DX, X0
+
+	// Add ncols / 8
+	ADDQ   $0x10, R8
+	MOVB   (AX)(R8*1), DL
+	PINSRB $0x0f, DX, X0
+
+	// Add ncols / 8
+	ADDQ $0x10, R8
+
+	// Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128
+	MOVQ DI, R8
+	ADDQ $0x07, R8
+
+	// Multiple with nrows
+	SHLQ $0x07, R8
+	ADDQ SI, R8
+	SHRQ $0x03, R8
+
+	// Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ     $0x10, R8
+	PMOVMSKB X0, BX
+	MOVW     BX, (CX)(R8*1)
+	PSLLQ    $0x01, X0
+
+	// Sub nrows / 8
+	SUBQ $0x10, R8
+
+	// Compare cc with ncols, here ncols=128
+	ADDQ $0x08, DI
+	CMPQ DI, $0x80
+	JL   col_loop_b0

 	// Compare rr with nrows, here nrows=128
 	ADDQ $0x10, SI
 	CMPQ SI, $0x80
-	JL   row_loop
+	JL   row_loop_b0
 	RET

 // func xor32x128(x *byte, y *byte, out *byte)
@@ -1045,12 +1715,12 @@ TEXT ·sbox128(SB), NOSPLIT, $0-16
 	PANDN X0, X4

 	// e10=^(g1 & l1)
-	MOVOU 16(CX), X11
-	PAND  X3, X11
-	PANDN X0, X11
+	MOVOU 16(CX), X1
+	PAND  X3, X1
+	PANDN X0, X1

 	// r6=e9 ^ e10
-	PXOR X11, X4
+	PXOR X1, X4

 	// e11=^(g0 & l0)
 	MOVOU (CX), X12
@@ -1058,11 +1728,10 @@ TEXT ·sbox128(SB), NOSPLIT, $0-16
 	PANDN X0, X12

 	// r7=e11 ^ e10
-	PXOR X12, X11
+	PXOR X12, X1

-	// store r6 r7
+	// store r6
 	MOVOU X4, 448(CX)
-	MOVOU X11, 464(CX)

 	// e12=^(m6 & k3)
 	MOVOU 224(CX), X3
@@ -1108,8 +1777,7 @@ TEXT ·sbox128(SB), NOSPLIT, $0-16
 	// start output function
 	// [t1]=r7 ^ r9
-	MOVOU 464(CX), X1
-	PXOR  X1, X9
+	PXOR X1, X9

 	// t2=t1 ^ r1
 	MOVOU 368(CX), X2
diff --git a/transpose128_amd64_test.go b/transpose128_amd64_test.go
index 66330d8..1bb0d49 100644
--- a/transpose128_amd64_test.go
+++ b/transpose128_amd64_test.go
@@ -70,3 +70,18 @@ func BenchmarkBS128Transpose(b *testing.B) {
 		transpose128(&input[0], &out[0])
 	}
 }
+
+func BenchmarkBS128TransposeRev(b *testing.B) {
+	key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
+	input := make([]byte, 128*16)
+	for i := 0; i < 128; i++ {
+		copy(input[i*16:], key)
+	}
+	out := make([]byte, 128*16)
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		transpose128Rev(&input[0], &out[0])
+	}
+}