From 6b8cda0e6d23436a167fe20a956b5a44771f4d56 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 13 Mar 2020 17:11:30 -0700 Subject: [PATCH] [Arm64] Vector Load/Store structure instructions (#33461) This adds support in the JIT emitter for Vector Load/Store structure instructions (C3.2.10 - Arm Architecture Reference Manual): - LD1 (1-4 registers) - LD2 - LD3 - LD4 - LD1R - LD2R - LD3R - LD4R - ST1 (1-4 registers) - ST2 - ST3 - ST4 in the following addressing modes: - Base register only - Post-indexed by a 64-bit register - Post-indexed by an immediate, equal to the number of bytes transferred Also adds support in JitDump for printing of * A SIMD vector register list. For example, ld1 {v5.16b, v6.16b, v7.16b, v8.16b}, [x9] * A SIMD vector element list. For example, st1 {v0.b}[3], [x1],#1 --- src/coreclr/src/jit/codegenarm64.cpp | 720 ++++++++++++++++++ src/coreclr/src/jit/emit.h | 2 + src/coreclr/src/jit/emitarm64.cpp | 1038 +++++++++++++++++++++----- src/coreclr/src/jit/emitarm64.h | 10 +- src/coreclr/src/jit/emitfmtsarm64.h | 16 +- src/coreclr/src/jit/instrsarm64.h | 122 ++- 6 files changed, 1727 insertions(+), 181 deletions(-) diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index da26c13433e4a..f227fc364107a 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -5219,6 +5219,726 @@ void CodeGen::genArm64EmitterUnitTests() #endif // ALL_ARM64_EMITTER_UNIT_TESTS +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to/Stores from one, two, three, or four SIMD&FP registers (base register only) + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1, EA_8BYTE, REG_V0, REG_R1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V2, REG_R3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1, EA_8BYTE, REG_V4, REG_R5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V6, REG_R7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1, 
EA_8BYTE, REG_V8, REG_R9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V10, REG_R11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1, EA_8BYTE, REG_V12, REG_R13, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V14, REG_R15, INS_OPTS_2D); + + // ld1 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V18, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V21, REG_R23, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V24, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V28, REG_SP, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1_4regs, 
EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V30, REG_R2, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_16BYTE, REG_V3, REG_R7, INS_OPTS_2D); + + // ld2 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld2, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld2, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld2, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V18, REG_R20, INS_OPTS_2D); + + // ld3 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld3, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld3, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld3, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V24, REG_R27, INS_OPTS_2D); + + // ld4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld4, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld4, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld4, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V25, 
REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V30, REG_R2, INS_OPTS_2D); + + // st1 {Vt}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V0, REG_R1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V2, REG_R3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V4, REG_R5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V6, REG_R7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V8, REG_R9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V10, REG_R11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V12, REG_R13, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V14, REG_R15, INS_OPTS_2D); + + // st1 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V18, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V21, REG_R23, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + 
theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V24, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V28, REG_SP, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V30, REG_R2, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V3, REG_R7, INS_OPTS_2D); + + // st2 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st2, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st2, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st2, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V18, REG_R20, INS_OPTS_2D); + + // st3 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st3, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st3, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st3, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V24, REG_R27, 
INS_OPTS_2D); + + // st4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st4, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st4, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st4, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V30, REG_R2, INS_OPTS_2D); + + // ld1r {Vt}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V0, REG_R1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V2, REG_R3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V4, REG_R5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V6, REG_R7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V8, REG_R9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V10, REG_R11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V12, REG_R13, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V14, REG_R15, INS_OPTS_2D); + + // ld2r {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V18, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V21, REG_R23, INS_OPTS_2D); + + // ld3r {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V0, REG_R3, 
INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V24, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V28, REG_SP, INS_OPTS_2D); + + // ld4r {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V30, REG_R2, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V3, REG_R7, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to/Stores from one, two, three, or four SIMD&FP registers (post-indexed by a register) + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V0, REG_R1, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V3, REG_R4, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V6, REG_R7, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V9, REG_R10, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V12, REG_R13, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V15, REG_R16, REG_R17, INS_OPTS_4S); + 
theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V18, REG_R19, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V21, REG_R22, REG_R23, INS_OPTS_2D); + + // ld1 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V28, REG_SP, REG_R30, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V2, REG_R5, REG_R6, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + 
theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V10, REG_R14, REG_R15, INS_OPTS_2D); + + // ld2 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld2, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld2, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld2, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_2D); + + // ld3 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld3, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld3, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld3, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld3, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld3, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld3, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld3, EA_16BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_2D); + + // ld4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld4, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + 
theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld4, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld4, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_2D); + + // st1 {Vt}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V0, REG_R1, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V3, REG_R4, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V6, REG_R7, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V9, REG_R10, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V12, REG_R13, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V15, REG_R16, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V18, REG_R19, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V21, REG_R22, REG_R23, INS_OPTS_2D); + + // st1 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, 
REG_V28, REG_SP, REG_R30, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V2, REG_R5, REG_R6, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V10, REG_R14, REG_R15, INS_OPTS_2D); + + // st2 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st2, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st2, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); 
+ theEmitter->emitIns_R_R_R(INS_st2, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_2D); + + // st3 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st3, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st3, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st3, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_2D); + + // st4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st4, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st4, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st4, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_2D); + + // ld1r {Vt}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1r, EA_8BYTE, REG_V0, REG_R1, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V3, REG_R4, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_8BYTE, REG_V6, REG_R7, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V9, REG_R10, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1r, 
EA_8BYTE, REG_V12, REG_R13, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V15, REG_R16, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_8BYTE, REG_V18, REG_R19, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V21, REG_R22, REG_R23, INS_OPTS_2D); + + // ld2r {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V28, REG_SP, REG_R30, INS_OPTS_2D); + + // ld3r {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V2, REG_R5, REG_R6, INS_OPTS_2D); + + // ld4r {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V6, 
REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V10, REG_R14, REG_R15, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to/Stores from one, two, three, or four SIMD&FP registers (post-indexed by an immediate) + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V0, REG_R1, 8, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V2, REG_R3, 16, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V4, REG_R5, 8, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V6, REG_R7, 16, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V8, REG_R9, 8, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V10, REG_R11, 16, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V12, REG_R13, 8, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V14, REG_R15, 16, INS_OPTS_2D); + + // ld1 {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V15, 
REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V18, REG_R20, 16, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V21, REG_R23, 32, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V24, REG_R27, 24, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V28, REG_SP, 48, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V30, REG_R2, 32, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V3, REG_R7, 64, INS_OPTS_2D); + + // ld2 {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + 
theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V15, REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V18, REG_R20, 32, INS_OPTS_2D); + + // ld3 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V24, REG_R27, 48, INS_OPTS_2D); + + // ld4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V30, REG_R2, 64, INS_OPTS_2D); + + // st1 {Vt}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V0, REG_R1, 8, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V2, REG_R3, 16, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V4, REG_R5, 8, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V6, REG_R7, 16, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V8, REG_R9, 
8, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V10, REG_R11, 16, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V12, REG_R13, 8, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V14, REG_R15, 16, INS_OPTS_2D); + + // st1 {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V15, REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V18, REG_R20, 16, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V21, REG_R23, 32, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V24, REG_R27, 24, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V28, REG_SP, 48, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + 
theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V30, REG_R2, 32, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V3, REG_R7, 64, INS_OPTS_2D); + + // st2 {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V15, REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V18, REG_R20, 32, INS_OPTS_2D); + + // st3 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V24, REG_R27, 48, INS_OPTS_2D); + + // st4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + 
theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V30, REG_R2, 64, INS_OPTS_2D); + + // ld1r {Vt}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V0, REG_R1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V2, REG_R3, 1, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V4, REG_R5, 2, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V6, REG_R7, 2, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V8, REG_R9, 4, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V10, REG_R11, 4, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V12, REG_R13, 8, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V14, REG_R15, 8, INS_OPTS_2D); + + // ld2r {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V0, REG_R2, 2, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, REG_V3, REG_R5, 2, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V6, REG_R8, 4, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, REG_V9, REG_R11, 4, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V12, REG_R14, 8, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, REG_V15, REG_R17, 8, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V18, REG_R20, 16, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, REG_V21, REG_R23, 16, INS_OPTS_2D); + + // ld3r {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V0, REG_R3, 3, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V4, REG_R7, 3, 
INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V8, REG_R11, 6, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V12, REG_R15, 6, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V16, REG_R19, 12, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V20, REG_R23, 12, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V24, REG_R27, 24, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V28, REG_SP, 24, INS_OPTS_2D); + + // ld4r {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V0, REG_R4, 4, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V5, REG_R9, 4, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V10, REG_R14, 8, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V15, REG_R19, 8, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V20, REG_R24, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V25, REG_R29, 16, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V30, REG_R2, 32, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V3, REG_R7, 32, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld1, EA_1BYTE, REG_V0, REG_R1, 3); + theEmitter->emitIns_R_R_I(INS_ld1, EA_2BYTE, REG_V2, REG_R3, 2); + theEmitter->emitIns_R_R_I(INS_ld1, EA_4BYTE, REG_V4, REG_R5, 1); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V6, REG_R7, 0); + + // ld2 {Vt, Vt2}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld2, EA_1BYTE, REG_V0, REG_R2, 4); + theEmitter->emitIns_R_R_I(INS_ld2, EA_2BYTE, REG_V3, REG_R5, 3); + theEmitter->emitIns_R_R_I(INS_ld2, EA_4BYTE, REG_V6, REG_R8, 2); + 
theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V9, REG_R11, 1); + + // ld3 {Vt, Vt2, Vt3}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld3, EA_1BYTE, REG_V0, REG_R3, 5); + theEmitter->emitIns_R_R_I(INS_ld3, EA_2BYTE, REG_V4, REG_R7, 4); + theEmitter->emitIns_R_R_I(INS_ld3, EA_4BYTE, REG_V8, REG_R11, 3); + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V12, REG_R15, 0); + + // ld4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld4, EA_1BYTE, REG_V0, REG_R4, 6); + theEmitter->emitIns_R_R_I(INS_ld4, EA_2BYTE, REG_V5, REG_R9, 5); + theEmitter->emitIns_R_R_I(INS_ld4, EA_4BYTE, REG_V10, REG_R14, 0); + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V15, REG_R19, 1); + + // st1 {Vt}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st1, EA_1BYTE, REG_V0, REG_R1, 7); + theEmitter->emitIns_R_R_I(INS_st1, EA_2BYTE, REG_V2, REG_R3, 6); + theEmitter->emitIns_R_R_I(INS_st1, EA_4BYTE, REG_V4, REG_R5, 1); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V6, REG_R7, 0); + + // st2 {Vt, Vt2}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st2, EA_1BYTE, REG_V0, REG_R2, 8); + theEmitter->emitIns_R_R_I(INS_st2, EA_2BYTE, REG_V3, REG_R5, 7); + theEmitter->emitIns_R_R_I(INS_st2, EA_4BYTE, REG_V6, REG_R8, 2); + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V9, REG_R11, 1); + + // st3 {Vt, Vt2, Vt3}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st3, EA_1BYTE, REG_V0, REG_R3, 9); + theEmitter->emitIns_R_R_I(INS_st3, EA_2BYTE, REG_V4, REG_R7, 0); + theEmitter->emitIns_R_R_I(INS_st3, EA_4BYTE, REG_V8, REG_R11, 3); + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V12, REG_R15, 0); + + // st4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st4, EA_1BYTE, REG_V0, REG_R4, 10); + theEmitter->emitIns_R_R_I(INS_st4, EA_2BYTE, REG_V5, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_st4, EA_4BYTE, REG_V10, REG_R14, 0); + theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V15, REG_R19, 1); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef 
ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_1BYTE, REG_V0, REG_R1, REG_R2, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_2BYTE, REG_V3, REG_R4, REG_R5, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_4BYTE, REG_V6, REG_R7, REG_R8, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_8BYTE, REG_V9, REG_R10, REG_R11, 0, INS_OPTS_POST_INDEX); + + // ld2 {Vt, Vt2}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_1BYTE, REG_V0, REG_R2, REG_R3, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_2BYTE, REG_V4, REG_R6, REG_R7, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_4BYTE, REG_V8, REG_R10, REG_R11, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_8BYTE, REG_V12, REG_R14, REG_R15, 1, INS_OPTS_POST_INDEX); + + // ld3 {Vt, Vt2, Vt3}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_1BYTE, REG_V0, REG_R3, REG_R4, 5, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_2BYTE, REG_V5, REG_R8, REG_R9, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_4BYTE, REG_V10, REG_R13, REG_R14, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_8BYTE, REG_V15, REG_R18, REG_R19, 0, INS_OPTS_POST_INDEX); + + // ld4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_1BYTE, REG_V0, REG_R4, REG_R5, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_2BYTE, REG_V6, REG_R10, REG_R11, 5, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_4BYTE, REG_V12, REG_R16, REG_R17, 0, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_8BYTE, REG_V18, REG_R22, REG_R23, 1, INS_OPTS_POST_INDEX); + + // st1 {Vt}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st1, 
EA_1BYTE, REG_V0, REG_R1, REG_R2, 7, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st1, EA_2BYTE, REG_V3, REG_R4, REG_R5, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st1, EA_4BYTE, REG_V6, REG_R7, REG_R8, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st1, EA_8BYTE, REG_V9, REG_R10, REG_R11, 0, INS_OPTS_POST_INDEX); + + // st2 {Vt, Vt2}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st2, EA_1BYTE, REG_V0, REG_R2, REG_R3, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st2, EA_2BYTE, REG_V4, REG_R6, REG_R7, 7, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st2, EA_4BYTE, REG_V8, REG_R10, REG_R11, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st2, EA_8BYTE, REG_V12, REG_R14, REG_R15, 1, INS_OPTS_POST_INDEX); + + // st3 {Vt, Vt2, Vt3}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st3, EA_1BYTE, REG_V0, REG_R3, REG_R4, 9, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st3, EA_2BYTE, REG_V5, REG_R8, REG_R9, 0, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st3, EA_4BYTE, REG_V10, REG_R13, REG_R14, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st3, EA_8BYTE, REG_V15, REG_R18, REG_R19, 0, INS_OPTS_POST_INDEX); + + // st4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st4, EA_1BYTE, REG_V0, REG_R4, REG_R5, 10, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st4, EA_2BYTE, REG_V6, REG_R10, REG_R11, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st4, EA_4BYTE, REG_V12, REG_R16, REG_R17, 0, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st4, EA_8BYTE, REG_V18, REG_R22, REG_R23, 1, INS_OPTS_POST_INDEX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_1BYTE, REG_V0, 
REG_R1, 3, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_2BYTE, REG_V2, REG_R3, 2, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_4BYTE, REG_V4, REG_R5, 1, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_8BYTE, REG_V6, REG_R7, 0, 8, INS_OPTS_POST_INDEX); + + // ld2 {Vt, Vt2}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_1BYTE, REG_V0, REG_R2, 4, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_2BYTE, REG_V3, REG_R5, 3, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_4BYTE, REG_V6, REG_R8, 2, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_8BYTE, REG_V9, REG_R11, 1, 16, INS_OPTS_POST_INDEX); + + // ld3 {Vt, Vt2, Vt3}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_1BYTE, REG_V0, REG_R3, 5, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_2BYTE, REG_V4, REG_R7, 4, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_4BYTE, REG_V8, REG_R11, 3, 12, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_8BYTE, REG_V12, REG_R15, 0, 24, INS_OPTS_POST_INDEX); + + // ld4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_1BYTE, REG_V0, REG_R4, 6, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_2BYTE, REG_V5, REG_R9, 5, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_4BYTE, REG_V10, REG_R14, 0, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_8BYTE, REG_V15, REG_R19, 1, 32, INS_OPTS_POST_INDEX); + + // st1 {Vt}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st1, EA_1BYTE, REG_V0, REG_R1, 3, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st1, EA_2BYTE, REG_V2, REG_R3, 2, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st1, EA_4BYTE, REG_V4, REG_R5, 1, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st1, EA_8BYTE, REG_V6, REG_R7, 0, 8, 
INS_OPTS_POST_INDEX); + + // st2 {Vt, Vt2}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st2, EA_1BYTE, REG_V0, REG_R2, 4, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st2, EA_2BYTE, REG_V3, REG_R5, 3, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st2, EA_4BYTE, REG_V6, REG_R8, 2, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st2, EA_8BYTE, REG_V9, REG_R11, 1, 16, INS_OPTS_POST_INDEX); + + // st3 {Vt, Vt2, Vt3}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st3, EA_1BYTE, REG_V0, REG_R3, 5, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st3, EA_2BYTE, REG_V4, REG_R7, 4, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st3, EA_4BYTE, REG_V8, REG_R11, 3, 12, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st3, EA_8BYTE, REG_V12, REG_R15, 0, 24, INS_OPTS_POST_INDEX); + + // st4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st4, EA_1BYTE, REG_V0, REG_R4, 6, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st4, EA_2BYTE, REG_V5, REG_R9, 5, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st4, EA_4BYTE, REG_V10, REG_R14, 0, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st4, EA_8BYTE, REG_V15, REG_R19, 1, 32, INS_OPTS_POST_INDEX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + #ifdef ALL_ARM64_EMITTER_UNIT_TESTS // // Compares diff --git a/src/coreclr/src/jit/emit.h b/src/coreclr/src/jit/emit.h index f40b01d0259c7..dda33e19a36ba 100644 --- a/src/coreclr/src/jit/emit.h +++ b/src/coreclr/src/jit/emit.h @@ -1233,6 +1233,8 @@ class emitter #define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles #define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles #define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles +#define PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles +#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles #define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles #define PERFSCORE_THROUGHPUT_10C 10.0f // 
slower - 10 cycles #define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 49b7b46b7cde9..e31a9ab43b5bc 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -227,22 +227,24 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(insOptsNone(id->idInsOpt()) || insOptsIndexed(id->idInsOpt())); break; - case IF_LS_2D: // LS_2D .Q.............. xx.xssnnnnnttttt Vt Rn - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); - assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - assert(emitGetInsSC(id) == 0); - assert(!id->idIsLclVar()); - break; - - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - elemsize = optGetElemsize(id->idInsOpt()); - assert(isValidVectorIndex(id->idOpSize(), elemsize, emitGetInsSC(id))); + assert(isIntegerRegister(id->idReg2())); // SP + if (insOptsAnyArrangement(id->idInsOpt())) + { + datasize = id->idOpSize(); + assert(isValidVectorDatasize(datasize)); + assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + } + else + { + elemsize = id->idOpSize(); + assert(isValidVectorElemsize(elemsize)); + assert(insOptsNone(id->idInsOpt()) || insOptsPostIndex(id->idInsOpt())); + } assert(!id->idIsLclVar()); break; @@ -304,24 +306,23 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(insOptsNone(id->idInsOpt())); break; - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); - assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - assert(isIntegerRegister(id->idReg3())); - assert(emitGetInsSC(id) == 0); - assert(!id->idIsLclVar()); - break; - - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - assert(isIntegerRegister(id->idReg3())); - elemsize = optGetElemsize(id->idInsOpt()); - assert(isValidVectorIndex(id->idOpSize(), elemsize, emitGetInsSC(id))); + assert(isIntegerRegister(id->idReg2())); // SP + assert(isGeneralRegister(id->idReg3())); + if (insOptsAnyArrangement(id->idInsOpt())) + { + datasize = id->idOpSize(); + assert(isValidVectorDatasize(datasize)); + assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + } 
+ else + { + elemsize = id->idOpSize(); + assert(isValidVectorElemsize(elemsize)); + assert(insOptsNone(id->idInsOpt()) || insOptsPostIndex(id->idInsOpt())); + } assert(!id->idIsLclVar()); break; @@ -956,14 +957,16 @@ bool emitter::emitInsMayWriteToGCReg(instrDesc* id) case IF_LS_2A: // LS_2A .X.......X...... ......nnnnnttttt Rt Rn case IF_LS_2B: // LS_2B .X.......Xiiiiii iiiiiinnnnnttttt Rt Rn imm(0-4095) case IF_LS_2C: // LS_2C .X.......X.iiiii iiiiP.nnnnnttttt Rt Rn imm(-256..+255) pre/post inc - case IF_LS_2D: // LS_2D .Q.............. xx.xssnnnnnttttt Vt Rn - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. ...Sssnnnnnttttt Vt[] Rn case IF_LS_3A: // LS_3A .X.......X.mmmmm xxxS..nnnnnttttt Rt Rn Rm ext(Rm) LSL {} case IF_LS_3B: // LS_3B X............... .aaaaannnnnttttt Rt Ra Rn case IF_LS_3C: // LS_3C X.........iiiiii iaaaaannnnnttttt Rt Ra Rn imm(im7,sh) case IF_LS_3D: // LS_3D .X.......X.mmmmm ......nnnnnttttt Wm Rt Rn - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm // For the Store instructions the "target" register is actually a "source" value @@ -1127,7 +1130,6 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id) case INS_str: case INS_ldur: case INS_stur: - case INS_ld1: result = id->idOpSize(); break; @@ -1261,11 +1263,17 @@ static const char * const bRegNames[] = }; // clang-format on -/***************************************************************************** - * - * Return a string that represents the given register. 
- */ - +//------------------------------------------------------------------------ +// emitRegName: Returns a general-purpose register name or SIMD and floating-point scalar register name. +// +// Arguments: +// reg - A general-purpose register or SIMD and floating-point register. +// size - A register size. +// varName - unused parameter. +// +// Return value: +// A string that represents a general-purpose register name or SIMD and floating-point scalar register name. +// const char* emitter::emitRegName(regNumber reg, emitAttr size, bool varName) { assert(reg < REG_COUNT); @@ -1301,11 +1309,15 @@ const char* emitter::emitRegName(regNumber reg, emitAttr size, bool varName) return rn; } -/***************************************************************************** - * - * Return a string that represents the given register. - */ - +//------------------------------------------------------------------------ +// emitVectorRegName: Returns a SIMD vector register name. +// +// Arguments: +// reg - A SIMD and floating-point register. +// +// Return value: +// A string that represents a SIMD vector register name. 
+// const char* emitter::emitVectorRegName(regNumber reg) { assert((reg >= REG_V0) && (reg <= REG_V31)); @@ -1314,6 +1326,7 @@ const char* emitter::emitVectorRegName(regNumber reg) return vRegNames[index]; } + #endif // DEBUG /***************************************************************************** @@ -1533,6 +1546,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode9[9] = {IF_DR_2E, IF_DR_2G, IF_DI_1B, IF_DI_1D, IF_DV_3C, IF_DV_2B, IF_DV_2C, IF_DV_2E, IF_DV_2F}; const static insFormat formatEncode6A[6] = {IF_DR_3A, IF_DR_3B, IF_DR_3C, IF_DI_2A, IF_DV_3A, IF_DV_3E}; + const static insFormat formatEncode6B[6] = {IF_LS_2D, IF_LS_3F, IF_LS_2E, IF_LS_2F, IF_LS_3G, IF_LS_2G}; const static insFormat formatEncode5A[5] = {IF_LS_2A, IF_LS_2B, IF_LS_2C, IF_LS_3A, IF_LS_1A}; const static insFormat formatEncode5B[5] = {IF_DV_2G, IF_DV_2H, IF_DV_2I, IF_DV_1A, IF_DV_1B}; const static insFormat formatEncode5C[5] = {IF_DR_3A, IF_DR_3B, IF_DI_2C, IF_DV_3C, IF_DV_1B}; @@ -1545,7 +1559,6 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode4G[4] = {IF_DR_2E, IF_DR_2F, IF_DV_2M, IF_DV_2L}; const static insFormat formatEncode4H[4] = {IF_DV_3E, IF_DV_3A, IF_DV_2L, IF_DV_2M}; const static insFormat formatEncode4I[4] = {IF_DV_3D, IF_DV_3B, IF_DV_2G, IF_DV_2A}; - const static insFormat formatEncode4J[4] = {IF_LS_2D, IF_LS_3F, IF_LS_2E, IF_LS_3G}; const static insFormat formatEncode3A[3] = {IF_DR_3A, IF_DR_3B, IF_DI_2C}; const static insFormat formatEncode3B[3] = {IF_DR_2A, IF_DR_2B, IF_DI_1C}; const static insFormat formatEncode3C[3] = {IF_DR_3A, IF_DR_3B, IF_DV_3C}; @@ -1555,6 +1568,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode3G[3] = {IF_DV_2A, IF_DV_2G, IF_DV_2I}; const static insFormat formatEncode3H[3] = {IF_DR_3A, IF_DV_3A, IF_DV_3AI}; const static insFormat formatEncode3I[3] = {IF_DR_2E, IF_DR_2F, 
IF_DV_2M}; + const static insFormat formatEncode3J[3] = {IF_LS_2D, IF_LS_3F, IF_LS_2E}; const static insFormat formatEncode2A[2] = {IF_DR_2E, IF_DR_2F}; const static insFormat formatEncode2B[2] = {IF_DR_3A, IF_DR_3B}; const static insFormat formatEncode2C[2] = {IF_DR_3A, IF_DI_2D}; @@ -1602,6 +1616,17 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; + case IF_EN6B: + for (index = 0; index < 6; index++) + { + if (fmt == formatEncode6B[index]) + { + encoding_found = true; + break; + } + } + break; + case IF_EN5A: for (index = 0; index < 5; index++) { @@ -1734,17 +1759,6 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; - case IF_EN4J: - for (index = 0; index < 4; index++) - { - if (fmt == formatEncode4J[index]) - { - encoding_found = true; - break; - } - } - break; - case IF_EN3A: for (index = 0; index < 3; index++) { @@ -1844,6 +1858,17 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; + case IF_EN3J: + for (index = 0; index < 3; index++) + { + if (fmt == formatEncode3J[index]) + { + encoding_found = true; + break; + } + } + break; + case IF_EN2A: for (index = 0; index < 2; index++) { @@ -3218,6 +3243,59 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) return false; } +//------------------------------------------------------------------------ +// insGetLoadStoreRegisterListSize: Returns a size of the register list a given instruction operates on. +// +// Arguments: +// ins - A Load/Store Vector instruction (e.g. ld1 (2 registers), ld1r, st1). +// +// Return value: +// A number of consecutive SIMD and floating-point registers the instruction loads to/store from. 
+// +/*static*/ unsigned emitter::insGetLoadStoreRegisterListSize(instruction ins) +{ + unsigned registerListSize = 0; + + switch (ins) + { + case INS_ld1: + case INS_ld1r: + case INS_st1: + registerListSize = 1; + break; + + case INS_ld1_2regs: + case INS_ld2: + case INS_ld2r: + case INS_st1_2regs: + case INS_st2: + registerListSize = 2; + break; + + case INS_ld1_3regs: + case INS_ld3: + case INS_ld3r: + case INS_st1_3regs: + case INS_st3: + registerListSize = 3; + break; + + case INS_ld1_4regs: + case INS_ld4: + case INS_ld4r: + case INS_st1_4regs: + case INS_st4: + registerListSize = 4; + break; + + default: + assert(!"Unexpected instruction"); + break; + } + + return registerListSize; +} + // For the given 'arrangement' returns the 'datasize' specified by the vector register arrangement // asserts and returns EA_UNKNOWN if an invalid 'arrangement' value is passed // @@ -4476,15 +4554,37 @@ void emitter::emitIns_R_R( fmt = IF_DV_2P; break; + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st2: + case INS_st3: + case INS_st4: + assert(opt != INS_OPTS_1D); // .1D format only permitted with LD1 & ST1 + __fallthrough; + case INS_ld1: - { + case INS_ld1_2regs: + case INS_ld1_3regs: + case INS_ld1_4regs: + case INS_st1: + case INS_st1_2regs: + case INS_st1_3regs: + case INS_st1_4regs: + case INS_ld1r: + case INS_ld2r: + case INS_ld3r: + case INS_ld4r: assert(isVectorRegister(reg1)); - assert(isIntegerRegister(reg2)); + assert(isGeneralRegisterOrSP(reg2)); assert(isValidVectorDatasize(size)); assert(isValidArrangement(size, opt)); - fmt = IF_LS_2D; + + // Load/Store multiple structures base register + // Load single structure and replicate base register + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_2D; break; - } default: unreached(); @@ -4627,6 +4727,7 @@ void emitter::emitIns_R_R_I( { bool canEncode; bitMaskImm bmi; + unsigned registerListSize; case INS_mov: // Check for the 'mov' aliases for the vector registers @@ -5023,6 +5124,72 @@ void 
emitter::emitIns_R_R_I( isLdSt = true; break; + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st2: + case INS_st3: + case INS_st4: + assert(opt != INS_OPTS_1D); // .1D format only permitted with LD1 & ST1 + __fallthrough; + + case INS_ld1: + case INS_ld1_2regs: + case INS_ld1_3regs: + case INS_ld1_4regs: + case INS_st1: + case INS_st1_2regs: + case INS_st1_3regs: + case INS_st1_4regs: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + + reg2 = encodingSPtoZR(reg2); + + if (insOptsAnyArrangement(opt)) + { + registerListSize = insGetLoadStoreRegisterListSize(ins); + assert(isValidVectorDatasize(size)); + assert(isValidArrangement(size, opt)); + assert((size * registerListSize) == imm); + + // Load/Store multiple structures post-indexed by an immediate + fmt = IF_LS_2E; + } + else + { + assert(insOptsNone(opt)); + assert((ins != INS_ld1_2regs) && (ins != INS_ld1_3regs) && (ins != INS_ld1_4regs) && + (ins != INS_st1_2regs) && (ins != INS_st1_3regs) && (ins != INS_st1_4regs)); + + elemsize = size; + assert(isValidVectorElemsize(elemsize)); + assert(isValidVectorIndex(EA_16BYTE, elemsize, imm)); + + // Load/Store single structure base register + fmt = IF_LS_2F; + } + break; + + case INS_ld1r: + case INS_ld2r: + case INS_ld3r: + case INS_ld4r: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + + assert(isValidVectorDatasize(size)); + assert(isValidArrangement(size, opt)); + + elemsize = optGetElemsize(opt); + registerListSize = insGetLoadStoreRegisterListSize(ins); + assert((elemsize * registerListSize) == imm); + + // Load single structure and replicate post-indexed by an immediate + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_2E; + break; + default: unreached(); break; @@ -5641,6 +5808,38 @@ void emitter::emitIns_R_R_R( fmt = IF_DV_3F; break; + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st2: + case INS_st3: + case INS_st4: + assert(opt != INS_OPTS_1D); // .1D format only permitted with LD1 & ST1 + 
__fallthrough; + + case INS_ld1: + case INS_ld1_2regs: + case INS_ld1_3regs: + case INS_ld1_4regs: + case INS_st1: + case INS_st1_2regs: + case INS_st1_3regs: + case INS_st1_4regs: + case INS_ld1r: + case INS_ld2r: + case INS_ld3r: + case INS_ld4r: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + assert(isGeneralRegister(reg3)); + assert(isValidArrangement(size, opt)); + + // Load/Store multiple structures post-indexed by a register + // Load single structure and replicate post-indexed by a register + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_3F; + break; + default: unreached(); break; @@ -5812,6 +6011,29 @@ void emitter::emitIns_R_R_R_I(instruction ins, isLdSt = true; break; + case INS_ld1: + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st1: + case INS_st2: + case INS_st3: + case INS_st4: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + assert(isGeneralRegister(reg3)); + + assert(insOptsPostIndex(opt)); + + elemsize = size; + assert(isValidVectorElemsize(elemsize)); + assert(isValidVectorIndex(EA_16BYTE, elemsize, imm)); + + // Load/Store single structure post-indexed by a register + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_3G; + break; + default: unreached(); break; @@ -6082,7 +6304,8 @@ void emitter::emitIns_R_R_R_Ext(instruction ins, * Add an instruction referencing two registers and two constants. 
*/ -void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2) +void emitter::emitIns_R_R_I_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2, insOpts opt) { emitAttr size = EA_SIZE(attr); emitAttr elemsize = EA_UNKNOWN; @@ -6095,6 +6318,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re int lsb; int width; bitMaskImm bmi; + unsigned registerListSize; case INS_bfm: case INS_sbfm: @@ -6103,6 +6327,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re assert(isGeneralRegister(reg2)); assert(isValidImmShift(imm1, size)); assert(isValidImmShift(imm2, size)); + assert(insOptsNone(opt)); bmi.immNRS = 0; bmi.immN = (size == EA_8BYTE); bmi.immR = imm1; @@ -6120,6 +6345,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re width = imm2 - 1; assert(isValidImmShift(lsb, size)); assert(isValidImmShift(width, size)); + assert(insOptsNone(opt)); bmi.immNRS = 0; bmi.immN = (size == EA_8BYTE); bmi.immR = lsb; @@ -6137,6 +6363,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re width = imm2 + imm1 - 1; assert(isValidImmShift(lsb, size)); assert(isValidImmShift(width, size)); + assert(insOptsNone(opt)); bmi.immNRS = 0; bmi.immN = (size == EA_8BYTE); bmi.immR = imm1; @@ -6153,10 +6380,36 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re assert(isValidVectorElemsize(elemsize)); assert(isValidVectorIndex(EA_16BYTE, elemsize, imm1)); assert(isValidVectorIndex(EA_16BYTE, elemsize, imm2)); + assert(insOptsNone(opt)); immOut = (imm1 << 4) + imm2; fmt = IF_DV_2F; break; + case INS_ld1: + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st1: + case INS_st2: + case INS_st3: + case INS_st4: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + + elemsize = size; + assert(isValidVectorElemsize(elemsize)); + 
assert(isValidVectorIndex(EA_16BYTE, elemsize, imm1)); + + registerListSize = insGetLoadStoreRegisterListSize(ins); + assert((elemsize * registerListSize) == (unsigned)imm2); + assert(insOptsPostIndex(opt)); + + // Load/Store single structure post-indexed by an immediate + reg2 = encodingSPtoZR(reg2); + immOut = imm1; + fmt = IF_LS_2G; + break; + default: unreached(); break; @@ -6167,6 +6420,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re id->idIns(ins); id->idInsFmt(fmt); + id->idInsOpt(opt); id->idReg1(reg1); id->idReg2(reg2); @@ -9495,27 +9749,28 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; - case IF_LS_2D: // LS_2D .Q.............. xx.xssnnnnnttttt Vt Rn + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn elemsize = optGetElemsize(id->idInsOpt()); code = emitInsCode(ins, fmt); code |= insEncodeVectorsize(id->idOpSize()); // Q - code |= 0x5000; // xxx - We only support the one register variant right now - code |= insEncodeVLSElemsize(elemsize); // ss - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeReg_Vt(id->idReg1()); // ttttt + code |= insEncodeVLSElemsize(elemsize); // ss + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn - elemsize = optGetElemsize(id->idInsOpt()); - imm = emitGetInsSC(id); + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn + elemsize = id->idOpSize(); + index = id->idSmallCns(); code = emitInsCode(ins, fmt); - code |= insEncodeVLSIndex(elemsize, imm); // Q xx S ss - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeReg_Vt(id->idReg1()); // ttttt + code |= insEncodeVLSIndex(elemsize, index); // Q xx S ss + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; @@ -9617,29 +9872,28 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm elemsize = optGetElemsize(id->idInsOpt()); code = emitInsCode(ins, fmt); - code |= insEncodeReg_Vt(id->idReg1()); // ttttt - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeVLSElemsize(elemsize); // ss - code |= 0x5000; // xx.x - We only support the one register variant right now - code |= insEncodeReg_Rm(id->idReg3()); // mmmmm code |= insEncodeVectorsize(id->idOpSize()); // Q + code |= insEncodeReg_Rm(id->idReg3()); // mmmmm + code |= insEncodeVLSElemsize(elemsize); // ss + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm - elemsize = optGetElemsize(id->idInsOpt()); - imm = emitGetInsSC(id); + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm + elemsize = id->idOpSize(); + index = id->idSmallCns(); code = emitInsCode(ins, fmt); - code |= insEncodeVLSIndex(elemsize, imm); // Q xx S ss - code |= insEncodeReg_Rm(id->idReg3()); // mmmmm - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeReg_Vt(id->idReg1()); // ttttt + code |= insEncodeVLSIndex(elemsize, index); // Q xx S ss + code |= insEncodeReg_Rm(id->idReg3()); // 
mmmmm + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; @@ -10763,10 +11017,9 @@ void emitter::emitDispLSExtendOpts(insOpts opt) assert(!"Bad value"); } -/***************************************************************************** - * - * Display a register - */ +//------------------------------------------------------------------------ +// emitDispReg: Display a general-purpose register name or SIMD and floating-point scalar register name +// void emitter::emitDispReg(regNumber reg, emitAttr attr, bool addComma) { emitAttr size = EA_SIZE(attr); @@ -10776,10 +11029,9 @@ void emitter::emitDispReg(regNumber reg, emitAttr attr, bool addComma) printf(", "); } -/***************************************************************************** - * - * Display a vector register with an arrangement suffix - */ +//------------------------------------------------------------------------ +// emitDispVectorReg: Display a SIMD vector register name with an arrangement suffix +// void emitter::emitDispVectorReg(regNumber reg, insOpts opt, bool addComma) { assert(isVectorRegister(reg)); @@ -10790,54 +11042,88 @@ void emitter::emitDispVectorReg(regNumber reg, insOpts opt, bool addComma) printf(", "); } -/***************************************************************************** - * - * Display an vector register index suffix - */ +//------------------------------------------------------------------------ +// emitDispVectorRegIndex: Display a SIMD vector register name with element index +// void emitter::emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t index, bool addComma) { assert(isVectorRegister(reg)); printf(emitVectorRegName(reg)); + emitDispElemsize(elemsize); + printf("[%d]", index); - switch (elemsize) + if (addComma) + printf(", "); +} + +//------------------------------------------------------------------------ +// emitDispVectorRegList: Display a SIMD vector 
register list +// +void emitter::emitDispVectorRegList(regNumber firstReg, unsigned listSize, insOpts opt, bool addComma) +{ + assert(isVectorRegister(firstReg)); + + regNumber currReg = firstReg; + + printf("{"); + for (unsigned i = 0; i < listSize; i++) { - case EA_1BYTE: - printf(".b"); - break; - case EA_2BYTE: - printf(".h"); - break; - case EA_4BYTE: - printf(".s"); - break; - case EA_8BYTE: - printf(".d"); - break; - default: - assert(!"invalid elemsize"); - break; + const bool notLastRegister = (i != listSize - 1); + emitDispVectorReg(currReg, opt, notLastRegister); + currReg = (currReg == REG_V31) ? REG_V0 : REG_NEXT(currReg); } - - printf("[%d]", index); + printf("}"); if (addComma) + { printf(", "); + } } -/***************************************************************************** - * - * Display an arrangement suffix - */ -void emitter::emitDispArrangement(insOpts opt) +//------------------------------------------------------------------------ +// emitDispVectorElemList: Display a SIMD vector element list +// +void emitter::emitDispVectorElemList( + regNumber firstReg, unsigned listSize, emitAttr elemsize, unsigned index, bool addComma) { - const char* str = "???"; + assert(isVectorRegister(firstReg)); - switch (opt) + regNumber currReg = firstReg; + + printf("{"); + for (unsigned i = 0; i < listSize; i++) { - case INS_OPTS_8B: - str = "8b"; - break; - case INS_OPTS_16B: + printf(emitVectorRegName(currReg)); + emitDispElemsize(elemsize); + const bool notLastRegister = (i != listSize - 1); + if (notLastRegister) + { + printf(", "); + } + currReg = (currReg == REG_V31) ? 
REG_V0 : REG_NEXT(currReg); + } + printf("}"); + printf("[%d]", index); + + if (addComma) + { + printf(", "); + } +} + +//------------------------------------------------------------------------ +// emitDispArrangement: Display a SIMD vector arrangement suffix +// +void emitter::emitDispArrangement(insOpts opt) +{ + const char* str = "???"; + + switch (opt) + { + case INS_OPTS_8B: + str = "8b"; + break; + case INS_OPTS_16B: str = "16b"; break; case INS_OPTS_4H: @@ -10866,10 +11152,39 @@ void emitter::emitDispArrangement(insOpts opt) printf(str); } -/***************************************************************************** - * - * Display a register with an optional shift operation - */ +//------------------------------------------------------------------------ +// emitDispElemsize: Display a SIMD vector element suffix +// +void emitter::emitDispElemsize(emitAttr elemsize) +{ + const char* str = "???"; + + switch (elemsize) + { + case EA_1BYTE: + str = ".b"; + break; + case EA_2BYTE: + str = ".h"; + break; + case EA_4BYTE: + str = ".s"; + break; + case EA_8BYTE: + str = ".d"; + break; + + default: + assert(!"invalid elemsize"); + break; + } + + printf(str); +} + +//------------------------------------------------------------------------ +// emitDispShiftedReg: Display a register with an optional shift operation +// void emitter::emitDispShiftedReg(regNumber reg, insOpts opt, ssize_t imm, emitAttr attr) { emitAttr size = EA_SIZE(attr); @@ -11136,6 +11451,7 @@ void emitter::emitDispIns( emitAttr dstsize; ssize_t index; ssize_t index2; + unsigned registerListSize; case IF_BI_0A: // BI_0A ......iiiiiiiiii iiiiiiiiiiiiiiii simm26:00 case IF_BI_0B: // BI_0B ......iiiiiiiiii iiiiiiiiiii..... simm19:00 @@ -11310,17 +11626,41 @@ void emitter::emitDispIns( emitDispAddrRI(id->idReg2(), id->idInsOpt(), imm); break; - case IF_LS_2D: // LS_2D .Q.............. 
xx.xssnnnnnttttt Vt Rn - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg2(), id->idInsOpt(), 0); + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn + registerListSize = insGetLoadStoreRegisterListSize(id->idIns()); + emitDispVectorRegList(id->idReg1(), registerListSize, id->idInsOpt(), true); + + if (fmt == IF_LS_2D) + { + // Load/Store multiple structures base register + // Load single structure and replicate base register + emitDispAddrRI(id->idReg2(), INS_OPTS_NONE, 0); + } + else + { + // Load/Store multiple structures post-indexed by an immediate + // Load single structure and replicate post-indexed by an immediate + emitDispAddrRI(id->idReg2(), INS_OPTS_POST_INDEX, id->idSmallCns()); + } break; - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn - assert(insOptsNone(id->idInsOpt())); - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg2(), id->idInsOpt(), 0); + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn + registerListSize = insGetLoadStoreRegisterListSize(id->idIns()); + elemsize = id->idOpSize(); + emitDispVectorElemList(id->idReg1(), registerListSize, elemsize, id->idSmallCns(), true); + + if (fmt == IF_LS_2F) + { + // Load/Store single structure base register + emitDispAddrRI(id->idReg2(), INS_OPTS_NONE, 0); + } + else + { + // Load/Store single structure post-indexed by an immediate + emitDispAddrRI(id->idReg2(), INS_OPTS_POST_INDEX, (registerListSize * elemsize)); + } break; case IF_LS_3A: // LS_3A .X.......X.mmmmm oooS..nnnnnttttt Rt Rn Rm ext(Rm) LSL {} @@ -11369,20 +11709,27 @@ void emitter::emitDispIns( emitDispAddrRI(id->idReg3(), id->idInsOpt(), 0); break; - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm - assert(insOptsNone(id->idInsOpt())); - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispReg(id->idReg2(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg3(), id->idInsOpt(), 0); - break; + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm + registerListSize = insGetLoadStoreRegisterListSize(id->idIns()); - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm - assert(insOptsNone(id->idInsOpt())); - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispReg(id->idReg2(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg3(), id->idInsOpt(), 0); + if (fmt == IF_LS_3F) + { + // Load/Store multiple structures post-indexed by a register + // Load single structure and replicate post-indexed by a register + emitDispVectorRegList(id->idReg1(), registerListSize, id->idInsOpt(), true); + } + else + { + // Load/Store single structure post-indexed by a register + elemsize = id->idOpSize(); + emitDispVectorElemList(id->idReg1(), registerListSize, elemsize, id->idSmallCns(), true); + } + + 
printf("["); + emitDispReg(encodingZRtoSP(id->idReg2()), EA_8BYTE, false); + printf("], "); + emitDispReg(id->idReg3(), EA_8BYTE, false); break; case IF_DI_1A: // DI_1A X.......shiiiiii iiiiiinnnnn..... Rn imm(i12,sh) @@ -12305,6 +12652,8 @@ void emitter::getMemoryOperation(instrDesc* id, unsigned* pMemAccessKind, bool* case IF_LS_2C: case IF_LS_2D: case IF_LS_2E: + case IF_LS_2F: + case IF_LS_2G: case IF_LS_3A: case IF_LS_3F: case IF_LS_3G: @@ -12706,22 +13055,375 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins } break; - case IF_LS_2D: // ld1 (vector - multiple structures) - case IF_LS_2E: // ld1 (vector - single structure) - case IF_LS_3F: // ld1 (vector - multiple structures) - case IF_LS_3G: // ld1 (vector - single structure) - if (id->idOpSize() == EA_8BYTE) + case IF_LS_2D: + case IF_LS_2E: + case IF_LS_3F: + // Load/Store multiple structures + // Load single structure and replicate + switch (ins) { - // D-form - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = PERFSCORE_LATENCY_3C; + case INS_ld1: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_ld1_2regs: + case INS_ld2: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + break; + + case INS_ld1_3regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + 
result.insThroughput = PERFSCORE_THROUGHPUT_6C; + result.insLatency = PERFSCORE_LATENCY_8C; + } + break; + + case INS_ld1_4regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_8C; + result.insLatency = PERFSCORE_LATENCY_10C; + } + break; + + case INS_ld3: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + if (optGetElemsize(id->idInsOpt()) == EA_4BYTE) + { + // S + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + if ((optGetElemsize(id->idInsOpt()) == EA_4BYTE) || + (optGetElemsize(id->idInsOpt()) == EA_8BYTE)) + { + // S/D + result.insThroughput = PERFSCORE_THROUGHPUT_6C; + result.insLatency = PERFSCORE_LATENCY_8C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_7C; + result.insLatency = PERFSCORE_LATENCY_9C; + } + } + break; + + case INS_ld4: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + if (optGetElemsize(id->idInsOpt()) == EA_4BYTE) + { + // S + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_5C; + result.insLatency = PERFSCORE_LATENCY_7C; + } + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + if ((optGetElemsize(id->idInsOpt()) == EA_4BYTE) || + (optGetElemsize(id->idInsOpt()) == EA_8BYTE)) + { + // S/D + result.insThroughput = PERFSCORE_THROUGHPUT_8C; + result.insLatency = PERFSCORE_LATENCY_10C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_9C; + result.insLatency = PERFSCORE_LATENCY_11C; + } + } + break; + + case INS_ld1r: + 
result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_ld2r: + if (optGetElemsize(id->idInsOpt()) == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + break; + + case INS_ld3r: + if (optGetElemsize(id->idInsOpt()) == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_ld4r: + if (optGetElemsize(id->idInsOpt()) == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_st1: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case INS_st1_2regs: + case INS_st2: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + break; + + case INS_st1_3regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + break; + + case INS_st1_4regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + else + { + // Q-form +
assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_st3: + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_st4: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + assert(id->idOpSize() == EA_16BYTE); + if (optGetElemsize(id->idInsOpt()) == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_5C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + } + break; + + default: + unreached(); } - else + break; + + case IF_LS_2F: + case IF_LS_2G: + case IF_LS_3G: + // Load/Store single structure + switch (ins) { - // Q-form - assert(id->idOpSize() == EA_16BYTE); - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency = PERFSCORE_LATENCY_4C; + case INS_ld1: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_ld2: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + break; + + case INS_ld3: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_ld4: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; 
+ result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_st1: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case INS_st2: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + } + break; + + case INS_st3: + case INS_st4: + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + default: + unreached(); } break; diff --git a/src/coreclr/src/jit/emitarm64.h b/src/coreclr/src/jit/emitarm64.h index 4bdd715b4b23b..f6286a6062360 100644 --- a/src/coreclr/src/jit/emitarm64.h +++ b/src/coreclr/src/jit/emitarm64.h @@ -39,7 +39,10 @@ void emitDispLSExtendOpts(insOpts opt); void emitDispReg(regNumber reg, emitAttr attr, bool addComma); void emitDispVectorReg(regNumber reg, insOpts opt, bool addComma); void emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t index, bool addComma); +void emitDispVectorRegList(regNumber firstReg, unsigned listSize, insOpts opt, bool addComma); +void emitDispVectorElemList(regNumber firstReg, unsigned listSize, emitAttr elemsize, unsigned index, bool addComma); void emitDispArrangement(insOpts opt); +void emitDispElemsize(emitAttr elemsize); void emitDispShiftedReg(regNumber reg, insOpts opt, ssize_t imm, emitAttr attr); void emitDispExtendReg(regNumber reg, insOpts opt, ssize_t imm); void emitDispAddrRI(regNumber reg, insOpts opt, ssize_t imm); @@ -445,6 +448,10 @@ static emitAttr optGetSrcsize(insOpts conversion); // for an element of size 'elemsize' in a vector register of size 'datasize' static bool isValidVectorIndex(emitAttr datasize, emitAttr elemsize, ssize_t index); +// For a given Load/Store Vector instruction 'ins' returns a number of consecutive SIMD registers +// the instruction loads to/store from. 
+static unsigned insGetLoadStoreRegisterListSize(instruction ins); + /************************************************************************/ /* Public inline informational methods */ /************************************************************************/ @@ -737,7 +744,8 @@ void emitIns_R_R_R_Ext(instruction ins, insOpts opt = INS_OPTS_NONE, int shiftAmount = -1); -void emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2); +void emitIns_R_R_I_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2, insOpts opt = INS_OPTS_NONE); void emitIns_R_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, regNumber reg4); diff --git a/src/coreclr/src/jit/emitfmtsarm64.h b/src/coreclr/src/jit/emitfmtsarm64.h index f8c52fea3a0dc..c0fd69edc05e5 100644 --- a/src/coreclr/src/jit/emitfmtsarm64.h +++ b/src/coreclr/src/jit/emitfmtsarm64.h @@ -48,6 +48,7 @@ IF_DEF(LARGELDC, IS_NONE, JMP) // large constant pseudo-op (adrp + ldr) IF_DEF(EN9, IS_NONE, NONE) // Instruction has 9 possible encoding types IF_DEF(EN6A, IS_NONE, NONE) // Instruction has 6 possible encoding types, type A +IF_DEF(EN6B, IS_NONE, NONE) // Instruction has 6 possible encoding types, type B IF_DEF(EN5A, IS_NONE, NONE) // Instruction has 5 possible encoding types, type A IF_DEF(EN5B, IS_NONE, NONE) // Instruction has 5 possible encoding types, type B IF_DEF(EN5C, IS_NONE, NONE) // Instruction has 5 possible encoding types, type C @@ -60,7 +61,6 @@ IF_DEF(EN4F, IS_NONE, NONE) // Instruction has 4 possible encoding types, type F IF_DEF(EN4G, IS_NONE, NONE) // Instruction has 4 possible encoding types, type G IF_DEF(EN4H, IS_NONE, NONE) // Instruction has 4 possible encoding types, type H IF_DEF(EN4I, IS_NONE, NONE) // Instruction has 4 possible encoding types, type I -IF_DEF(EN4J, IS_NONE, NONE) // Instruction has 4 possible encoding types, type J IF_DEF(EN3A, IS_NONE, NONE) // Instruction has 3 possible 
encoding types, type A IF_DEF(EN3B, IS_NONE, NONE) // Instruction has 3 possible encoding types, type B IF_DEF(EN3C, IS_NONE, NONE) // Instruction has 3 possible encoding types, type C @@ -70,6 +70,7 @@ IF_DEF(EN3F, IS_NONE, NONE) // Instruction has 3 possible encoding types, type F IF_DEF(EN3G, IS_NONE, NONE) // Instruction has 3 possible encoding types, type G IF_DEF(EN3H, IS_NONE, NONE) // Instruction has 3 possible encoding types, type H IF_DEF(EN3I, IS_NONE, NONE) // Instruction has 3 possible encoding types, type I +IF_DEF(EN3J, IS_NONE, NONE) // Instruction has 3 possible encoding types, type J IF_DEF(EN2A, IS_NONE, NONE) // Instruction has 2 possible encoding types, type A IF_DEF(EN2B, IS_NONE, NONE) // Instruction has 2 possible encoding types, type B IF_DEF(EN2C, IS_NONE, NONE) // Instruction has 2 possible encoding types, type C @@ -133,15 +134,20 @@ IF_DEF(LS_1A, IS_NONE, JMP) // LS_1A XX...V..iiiiiiii iiiiiiiiiiittttt R IF_DEF(LS_2A, IS_NONE, NONE) // LS_2A .X.......X...... ......nnnnnttttt Rt Rn IF_DEF(LS_2B, IS_NONE, NONE) // LS_2B .X.......Xiiiiii iiiiiinnnnnttttt Rt Rn imm(0-4095) IF_DEF(LS_2C, IS_NONE, NONE) // LS_2C .X.......X.iiiii iiiiP.nnnnnttttt Rt Rn imm(-256..+255) pre/post inc -IF_DEF(LS_2D, IS_NONE, NONE) // LS_2D .Q.............. xx.xssnnnnnttttt Rn Vt -IF_DEF(LS_2E, IS_NONE, NONE) // LS_2E .Q.............. xx.Sssnnnnnttttt Rn Vt[] +IF_DEF(LS_2D, IS_NONE, NONE) // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn Load/Store multiple structures base register + // Load single structure and replicate base register +IF_DEF(LS_2E, IS_NONE, NONE) // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn Load/Store multiple structures post-indexed by an immediate + // Load single structure and replicate post-indexed by an immediate +IF_DEF(LS_2F, IS_NONE, NONE) // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn Load/Store single structure base register +IF_DEF(LS_2G, IS_NONE, NONE) // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn Load/Store single structure post-indexed by an immediate IF_DEF(LS_3A, IS_NONE, NONE) // LS_3A .X.......X.mmmmm xxxS..nnnnnttttt Rt Rn Rm ext(Rm) LSL {} IF_DEF(LS_3B, IS_NONE, NONE) // LS_3B X............... .aaaaannnnnddddd Rd Ra Rn IF_DEF(LS_3C, IS_NONE, NONE) // LS_3C X.........iiiiii iaaaaannnnnddddd Rd Ra Rn imm(im7,sh) IF_DEF(LS_3D, IS_NONE, NONE) // LS_3D .X.......X.mmmmm ......nnnnnttttt Wm Rt Rn IF_DEF(LS_3E, IS_NONE, NONE) // LS_3E .X.........mmmmm ......nnnnnttttt Rm Rt Rn ARMv8.1 LSE Atomics -IF_DEF(LS_3F, IS_NONE, NONE) // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Rm Rn Vt -IF_DEF(LS_3G, IS_NONE, NONE) // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Rm Rn Vt[] +IF_DEF(LS_3F, IS_NONE, NONE) // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm Load/Store multiple structures post-indexed by a register + // Load single structure and replicate post-indexed by a register +IF_DEF(LS_3G, IS_NONE, NONE) // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm Load/Store single structure post-indexed by a register IF_DEF(DI_1A, IS_NONE, NONE) // DI_1A X.......shiiiiii iiiiiinnnnn..... 
Rn imm(i12,sh) IF_DEF(DI_1B, IS_NONE, NONE) // DI_1B X........hwiiiii iiiiiiiiiiiddddd Rd imm(i16,hw) diff --git a/src/coreclr/src/jit/instrsarm64.h b/src/coreclr/src/jit/instrsarm64.h index 2eadae317324f..6958660ac68a3 100644 --- a/src/coreclr/src/jit/instrsarm64.h +++ b/src/coreclr/src/jit/instrsarm64.h @@ -85,6 +85,71 @@ INST6(sub, "sub", 0, 0, IF_EN6A, 0x4B000000, 0x4B000000, 0x4B200000, // sub Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 100001nnnnnddddd 2E20 8400 Vd,Vn,Vm (vector) // sub Vd,Vn,Vm DV_3E 01111110111mmmmm 100001nnnnnddddd 7EE0 8400 Vd,Vn,Vm (scalar) +// enum name FP LD/ST LS_2D LS_3F LS_2E LS_2F LS_3G LS_2G +INST6(ld1, "ld1", 0, LD, IF_EN6B, 0x0C407000, 0x0CC07000, 0x0CDF7000, 0x0D400000, 0x0DC00000, 0x0DDF0000) + // ld1 {Vt},[Xn] LS_2D 0Q00110001000000 0111ssnnnnnttttt 0C40 7000 base register + // ld1 {Vt},[Xn],Xm LS_3F 0Q001100110mmmmm 0111ssnnnnnttttt 0CC0 7000 post-indexed by a register + // ld1 {Vt},[Xn],#imm LS_2E 0Q00110011011111 0111ssnnnnnttttt 0CDF 7000 post-indexed by an immediate + // ld1 {Vt}[],[Xn] LS_2F 0Q00110101000000 xx0Sssnnnnnttttt 0D40 0000 base register + // ld1 {Vt}[],[Xn],Xm LS_3G 0Q001101110mmmmm xx0Sssnnnnnttttt 0DC0 0000 post-indexed by a register + // ld1 {Vt}[],[Xn],#imm LS_2G 0Q00110111011111 xx0Sssnnnnnttttt 0DDF 0000 post-indexed by an immediate + +INST6(ld2, "ld2", 0, LD, IF_EN6B, 0x0C408000, 0x0CC08000, 0x0CDF8000, 0x0D600000, 0x0DE00000, 0x0DFF0000) + // ld2 {Vt,Vt2},[Xn] LS_2D 0Q00110001000000 1000ssnnnnnttttt 0C40 8000 base register + // ld2 {Vt,Vt2},[Xn],Xm LS_3F 0Q001100110mmmmm 1000ssnnnnnttttt 0CC0 8000 post-indexed by a register + // ld2 {Vt,Vt2},[Xn],#imm LS_2E 0Q00110011011111 1000ssnnnnnttttt 0CDF 8000 post-indexed by an immediate + // ld2 {Vt,Vt2}[],[Xn] LS_2F 0Q00110101100000 xx0Sssnnnnnttttt 0D60 0000 base register + // ld2 {Vt,Vt2}[],[Xn],Xm LS_3G 0Q001101111mmmmm xx0Sssnnnnnttttt 0DE0 0000 post-indexed by a register + // ld2 {Vt,Vt2}[],[Xn],#imm LS_2G 0Q00110111111111 xx0Sssnnnnnttttt 0DFF 0000 post-indexed 
by an immediate + +INST6(ld3, "ld3", 0, LD, IF_EN6B, 0x0C404000, 0x0CC04000, 0x0CDF4000, 0x0D402000, 0x0DC02000, 0x0DDF2000) + // ld3 {Vt-Vt3},[Xn] LS_2D 0Q00110001000000 0100ssnnnnnttttt 0C40 4000 base register + // ld3 {Vt-Vt3},[Xn],Xm LS_3F 0Q001100110mmmmm 0100ssnnnnnttttt 0CC0 4000 post-indexed by a register + // ld3 {Vt-Vt3},[Xn],#imm LS_2E 0Q00110011011111 0100ssnnnnnttttt 0CDF 4000 post-indexed by an immediate + // ld3 {Vt-Vt3}[],[Xn] LS_2F 0Q00110101000000 xx1Sssnnnnnttttt 0D40 2000 base register + // ld3 {Vt-Vt3}[],[Xn],Xm LS_3G 0Q001101110mmmmm xx1Sssnnnnnttttt 0DC0 2000 post-indexed by a register + // ld3 {Vt-Vt3}[],[Xn],#imm LS_2G 0Q00110111011111 xx1Sssnnnnnttttt 0DDF 2000 post-indexed by an immediate + +INST6(ld4, "ld4", 0, LD, IF_EN6B, 0x0C400000, 0x0CC00000, 0x0CDF0000, 0x0D602000, 0x0DE02000, 0x0DFF2000) + // ld4 {Vt-Vt4},[Xn] LS_2D 0Q00110001000000 0000ssnnnnnttttt 0C40 0000 base register + // ld4 {Vt-Vt4},[Xn],Xm LS_3F 0Q001100110mmmmm 0000ssnnnnnttttt 0CC0 0000 post-indexed by a register + // ld4 {Vt-Vt4},[Xn],#imm LS_2E 0Q00110011011111 0000ssnnnnnttttt 0CDF 0000 post-indexed by an immediate + // ld4 {Vt-Vt4}[],[Xn] LS_2F 0Q00110101100000 xx1Sssnnnnnttttt 0D60 2000 base register + // ld4 {Vt-Vt4}[],[Xn],Xm LS_3G 0Q001101111mmmmm xx1Sssnnnnnttttt 0DE0 2000 post-indexed by a register + // ld4 {Vt-Vt4}[],[Xn],#imm LS_2G 0Q00110111111111 xx1Sssnnnnnttttt 0DFF 2000 post-indexed by an immediate + +INST6(st1, "st1", 0, ST, IF_EN6B, 0x0C007000, 0x0C807000, 0x0C9F7000, 0x0D000000, 0x0D800000, 0x0D9F0000) + // st1 {Vt},[Xn] LS_2D 0Q00110000000000 0111ssnnnnnttttt 0C00 7000 base register + // st1 {Vt},[Xn],Xm LS_3F 0Q001100100mmmmm 0111ssnnnnnttttt 0C80 7000 post-indexed by a register + // st1 {Vt},[Xn],#imm LS_2E 0Q00110010011111 0111ssnnnnnttttt 0C9F 7000 post-indexed by an immediate + // st1 {Vt}[],[Xn] LS_2F 0Q00110100000000 xx0Sssnnnnnttttt 0D00 0000 base register + // st1 {Vt}[],[Xn],Xm LS_3G 0Q001101100mmmmm xx0Sssnnnnnttttt 0D80 0000 post-indexed 
by a register + // st1 {Vt}[],[Xn],#imm LS_2G 0Q00110110011111 xx0Sssnnnnnttttt 0D9F 0000 post-indexed by an immediate + +INST6(st2, "st2", 0, ST, IF_EN6B, 0x0C008000, 0x0C808000, 0x0C9F8000, 0x0D200000, 0x0DA00000, 0x0DBF0000) + // st2 {Vt,Vt2},[Xn] LS_2D 0Q00110000000000 1000ssnnnnnttttt 0C00 8000 base register + // st2 {Vt,Vt2},[Xn],Xm LS_3F 0Q001100100mmmmm 1000ssnnnnnttttt 0C80 8000 post-indexed by a register + // st2 {Vt,Vt2},[Xn],#imm LS_2E 0Q00110010011111 1000ssnnnnnttttt 0C9F 8000 post-indexed by an immediate + // st2 {Vt,Vt2}[],[Xn] LS_2F 0Q00110100100000 xx0Sssnnnnnttttt 0D20 0000 base register + // st2 {Vt,Vt2}[],[Xn],Xm LS_3G 0Q001101101mmmmm xx0Sssnnnnnttttt 0DA0 0000 post-indexed by a register + // st2 {Vt,Vt2}[],[Xn],#imm LS_2G 0Q00110110111111 xx0Sssnnnnnttttt 0DBF 0000 post-indexed by an immediate + +INST6(st3, "st3", 0, ST, IF_EN6B, 0x0C004000, 0x0C804000, 0x0C9F4000, 0x0D002000, 0x0D802000, 0x0D9F2000) + // st3 {Vt-Vt3},[Xn] LS_2D 0Q00110000000000 0100ssnnnnnttttt 0C00 4000 base register + // st3 {Vt-Vt3},[Xn],Xm LS_3F 0Q001100100mmmmm 0100ssnnnnnttttt 0C80 4000 post-indexed by a register + // st3 {Vt-Vt3},[Xn],#imm LS_2E 0Q00110010011111 0100ssnnnnnttttt 0C9F 4000 post-indexed by an immediate + // st3 {Vt-Vt3}[],[Xn] LS_2F 0Q00110100000000 xx1Sssnnnnnttttt 0D00 2000 base register + // st3 {Vt-Vt3}[],[Xn],Xm LS_3G 0Q001101100mmmmm xx1Sssnnnnnttttt 0D80 2000 post-indexed by a register + // st3 {Vt-Vt3}[],[Xn],#imm LS_2G 0Q00110110011111 xx1Sssnnnnnttttt 0D9F 2000 post-indexed by an immediate + +INST6(st4, "st4", 0, ST, IF_EN6B, 0x0C000000, 0x0C800000, 0x0C9F0000, 0x0D202000, 0x0DA02000, 0x0DBF2000) + // st4 {Vt-Vt4},[Xn] LS_2D 0Q00110000000000 0000ssnnnnnttttt 0C00 0000 base register + // st4 {Vt-Vt4},[Xn],Xm LS_3F 0Q001100100mmmmm 0000ssnnnnnttttt 0C80 0000 post-indexed by a register + // st4 {Vt-Vt4},[Xn],#imm LS_2E 0Q00110010011111 0000ssnnnnnttttt 0C9F 0000 post-indexed by an immediate + // st4 {Vt-Vt4}[],[Xn] LS_2F 0Q00110100100000 
xx1Sssnnnnnttttt 0D20 2000 base register + // st4 {Vt-Vt4}[],[Xn],Xm LS_3G 0Q001101101mmmmm xx1Sssnnnnnttttt 0DA0 2000 post-indexed by a register + // st4 {Vt-Vt4}[],[Xn],#imm LS_2G 0Q00110110111111 xx1Sssnnnnnttttt 0DBF 2000 post-indexed by an immediate + // enum name FP LD/ST LS_2A LS_2B LS_2C LS_3A LS_1A INST5(ldr, "ldr", 0,LD, IF_EN5A, 0xB9400000, 0xB9400000, 0xB8400000, 0xB8600800, 0x18000000) // ldr Rt,[Xn] LS_2A 1X11100101000000 000000nnnnnttttt B940 0000 @@ -263,13 +328,6 @@ INST4(fcmgt, "fcmgt", 0, 0, IF_EN4I, 0x7EA0E400, 0x2EA0E400, 0x5EA0C800, // fcmgt Vd,Vn DV_2G 010111101X100000 110010nnnnnddddd 5EA0 E800 Vd Vn (scalar) // fcmgt Vd,Vn DV_2A 0Q0011101X100000 110010nnnnnddddd 0EA0 C800 Vd Vn (vector) -// enum name FP LD/ST LS_2D LS_3F LS_2E LS_3G -INST4(ld1, "ld1", 0, LD,IF_EN4J, 0x0C402000, 0x0CC02000, 0x0D400000, 0x0DC00000) - // ld1 Vd,Rn LS_2D 0Q00110001000000 xx1xssnnnnnttttt 0C40 2000 Vd,Rn (vector - multiple structures) - // ld1 Vd,Rn,Rm LS_3F 0Q001100110mmmmm xx1xssnnnnnttttt 0CC0 2000 Vd,Rn,Rm (vector - multiple structures) - // ld1 Vd[],Rn LS_2E 0Q00110101000000 xx0Sssnnnnnttttt 0D40 0000 Vd[],Rn (vector - single structure) - // ld1 Vd[],Rn,Rm LS_3G 0Q001101110mmmmm xx0Sssnnnnnttttt 0DC0 0000 Vd[],Rn,Rm (vector - single structure) - // enum name FP LD/ST DR_3A DR_3B DI_2C INST3(ands, "ands", 0, 0, IF_EN3A, 0x6A000000, 0x6A000000, 0x72000000) // ands Rd,Rn,Rm DR_3A X1101010000mmmmm 000000nnnnnddddd 6A00 0000 @@ -378,6 +436,56 @@ INST3(mvn, "mvn", 0, 0, IF_EN3I, 0x2A2003E0, 0x2A2003E0, 0x2E205800) // mvn Rd,(Rm,shk,imm) DR_2F X0101010sh1mmmmm iiiiii11111ddddd 2A20 03E0 Rm {LSL,LSR,ASR} imm(0-63) // mvn Vd,Vn DV_2M 0Q10111000100000 010110nnnnnddddd 2E20 5800 Vd,Vn (vector) +// enum name FP LD/ST LS_2D LS_3F LS_2E +INST3(ld1_2regs,"ld1", 0,LD, IF_EN3J, 0x0C40A000, 0x0CC0A000, 0x0CDFA000) + // ld1 {Vt,Vt2},[Xn] LS_2D 0Q00110001000000 1010ssnnnnnttttt 0C40 A000 base register + // ld1 {Vt,Vt2},[Xn],Xm LS_3F 0Q001100110mmmmm 1010ssnnnnnttttt 0CC0 A000 
post-indexed by a register
+    //  ld1   {Vt,Vt2},[Xn],#imm     LS_2E  0Q00110011011111 1010ssnnnnnttttt  0CDF A000  post-indexed by an immediate
+
+INST3(ld1_3regs,"ld1", 0,LD, IF_EN3J, 0x0C406000, 0x0CC06000, 0x0CDF6000)
+    //  ld1   {Vt-Vt3},[Xn]          LS_2D  0Q00110001000000 0110ssnnnnnttttt  0C40 6000  base register
+    //  ld1   {Vt-Vt3},[Xn],Xm       LS_3F  0Q001100110mmmmm 0110ssnnnnnttttt  0CC0 6000  post-indexed by a register
+    //  ld1   {Vt-Vt3},[Xn],#imm     LS_2E  0Q00110011011111 0110ssnnnnnttttt  0CDF 6000  post-indexed by an immediate
+
+INST3(ld1_4regs,"ld1", 0,LD, IF_EN3J, 0x0C402000, 0x0CC02000, 0x0CDF2000)
+    //  ld1   {Vt-Vt4},[Xn]          LS_2D  0Q00110001000000 0010ssnnnnnttttt  0C40 2000  base register
+    //  ld1   {Vt-Vt4},[Xn],Xm       LS_3F  0Q001100110mmmmm 0010ssnnnnnttttt  0CC0 2000  post-indexed by a register
+    //  ld1   {Vt-Vt4},[Xn],#imm     LS_2E  0Q00110011011111 0010ssnnnnnttttt  0CDF 2000  post-indexed by an immediate
+
+INST3(st1_2regs,"st1", 0,ST, IF_EN3J, 0x0C00A000, 0x0C80A000, 0x0C9FA000)
+    //  st1   {Vt,Vt2},[Xn]          LS_2D  0Q00110000000000 1010ssnnnnnttttt  0C00 A000  base register
+    //  st1   {Vt,Vt2},[Xn],Xm       LS_3F  0Q001100100mmmmm 1010ssnnnnnttttt  0C80 A000  post-indexed by a register
+    //  st1   {Vt,Vt2},[Xn],#imm     LS_2E  0Q00110010011111 1010ssnnnnnttttt  0C9F A000  post-indexed by an immediate
+
+INST3(st1_3regs,"st1", 0,ST, IF_EN3J, 0x0C006000, 0x0C806000, 0x0C9F6000)
+    //  st1   {Vt-Vt3},[Xn]          LS_2D  0Q00110000000000 0110ssnnnnnttttt  0C00 6000  base register
+    //  st1   {Vt-Vt3},[Xn],Xm       LS_3F  0Q001100100mmmmm 0110ssnnnnnttttt  0C80 6000  post-indexed by a register
+    //  st1   {Vt-Vt3},[Xn],#imm     LS_2E  0Q00110010011111 0110ssnnnnnttttt  0C9F 6000  post-indexed by an immediate
+
+INST3(st1_4regs,"st1", 0,ST, IF_EN3J, 0x0C002000, 0x0C802000, 0x0C9F2000)
+    //  st1   {Vt-Vt4},[Xn]          LS_2D  0Q00110000000000 0010ssnnnnnttttt  0C00 2000  base register
+    //  st1   {Vt-Vt4},[Xn],Xm       LS_3F  0Q001100100mmmmm 0010ssnnnnnttttt  0C80 2000  post-indexed by a register
+    //  st1   {Vt-Vt4},[Xn],#imm     LS_2E  0Q00110010011111 0010ssnnnnnttttt  0C9F 2000  post-indexed by an
immediate
+
+INST3(ld1r, "ld1r", 0,LD, IF_EN3J, 0x0D40C000, 0x0DC0C000, 0x0DDFC000)
+    //  ld1r  {Vt},[Xn]              LS_2D  0Q00110101000000 1100ssnnnnnttttt  0D40 C000  base register
+    //  ld1r  {Vt},[Xn],Xm           LS_3F  0Q001101110mmmmm 1100ssnnnnnttttt  0DC0 C000  post-indexed by a register
+    //  ld1r  {Vt},[Xn],#1           LS_2E  0Q00110111011111 1100ssnnnnnttttt  0DDF C000  post-indexed by an immediate
+
+INST3(ld2r, "ld2r", 0,LD, IF_EN3J, 0x0D60C000, 0x0DE0C000, 0x0DFFC000)
+    //  ld2r  {Vt,Vt2},[Xn]          LS_2D  0Q00110101100000 1100ssnnnnnttttt  0D60 C000  base register
+    //  ld2r  {Vt,Vt2},[Xn],Xm       LS_3F  0Q001101111mmmmm 1100ssnnnnnttttt  0DE0 C000  post-indexed by a register
+    //  ld2r  {Vt,Vt2},[Xn],#2       LS_2E  0Q00110111111111 1100ssnnnnnttttt  0DFF C000  post-indexed by an immediate
+
+INST3(ld3r, "ld3r", 0,LD, IF_EN3J, 0x0D40E000, 0x0DC0E000, 0x0DDFE000)
+    //  ld3r  {Vt-Vt3},[Xn]          LS_2D  0Q00110101000000 1110ssnnnnnttttt  0D40 E000  base register
+    //  ld3r  {Vt-Vt3},[Xn],Xm       LS_3F  0Q001101110mmmmm 1110ssnnnnnttttt  0DC0 E000  post-indexed by a register
+    //  ld3r  {Vt-Vt3},[Xn],#3       LS_2E  0Q00110111011111 1110ssnnnnnttttt  0DDF E000  post-indexed by an immediate
+
+INST3(ld4r, "ld4r", 0,LD, IF_EN3J, 0x0D60E000, 0x0DE0E000, 0x0DFFE000)
+    //  ld4r  {Vt-Vt4},[Xn]          LS_2D  0Q00110101100000 1110ssnnnnnttttt  0D60 E000  base register
+    //  ld4r  {Vt-Vt4},[Xn],Xm       LS_3F  0Q001101111mmmmm 1110ssnnnnnttttt  0DE0 E000  post-indexed by a register
+    //  ld4r  {Vt-Vt4},[Xn],#4       LS_2E  0Q00110111111111 1110ssnnnnnttttt  0DFF E000  post-indexed by an immediate
 
 // enum name FP LD/ST DR_2E DR_2F
 INST2(negs, "negs", 0, 0, IF_EN2A, 0x6B0003E0, 0x6B0003E0)