Refactor vector instructions again

Argument order is sane again!
saiminhtet · May 16, 2020 · f63b888 · f63b888
1 parent 3795b28
commit f63b888
Show file tree

Hide file tree

Showing 6 changed files with 170 additions and 157 deletions.
diff --git a/emu/decode.h b/emu/decode.h
@@ -59,9 +59,9 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                 case 0x18 ... 0x1f: TRACEI("nop modrm\t"); READMODRM; break;
 
                 case 0x28: TRACEI("movaps xmm:modrm, xmm");
-                           READMODRM; VLOAD(xmm_modrm_val, xmm_modrm_reg,128); break;
+                           READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0x29: TRACEI("movaps xmm, xmm:modrm");
-                           READMODRM; VSTORE(xmm_modrm_reg, xmm_modrm_val,128); break;
+                           READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;
 
                 case 0x2e: TRACEI("ucomiss xmm, xmm:modrm");
                            READMODRM; VCOMPARE(xmm_modrm_val, xmm_modrm_reg,32);
@@ -267,11 +267,11 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
 
 #if OP_SIZE == 16
                 case 0x6e: TRACEI("movd modrm, xmm");
-                           // TODO: REX.W = 1 might be needed later
-                           READMODRM; VZLOAD(xmm_modrm_val, xmm_modrm_reg,32); break;
+                           // TODO: the modrm operand of movd is a general register (or memory), not an xmm register
+                           READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,32); break;
 
                 case 0x6f: TRACEI("movdqa xmm:modrm, xmm");
-                           READMODRM; VLOAD(xmm_modrm_val, xmm_modrm_reg,128); break;
+                           READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break;
 
                 case 0x73: READMODRM;
                            switch (modrm.opcode) {
@@ -282,19 +282,22 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                            break;
 
                 case 0x7e: TRACEI("movd xmm, modrm");
-                           // TODO: REX.W = 1 might be needed later
-                           READMODRM; VSTORE(xmm_modrm_reg, xmm_modrm_val,32); break;
+                           // TODO: the modrm operand of movd is a general register (or memory), not an xmm register
+                           READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,32); break;
 
                 case 0x7f: TRACEI("movdqa xmm, xmm:modrm");
-                           READMODRM; VSTORE(xmm_modrm_reg, xmm_modrm_val,128); break;
+                           READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;
+
+                case 0xd6: TRACEI("movq xmm, xmm:modrm");
+                           READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,64); break;
 
                 case 0xef: TRACEI("pxor xmm:modrm xmm");
                            READMODRM; VXOR(xmm_modrm_val, xmm_modrm_reg,128); break;
 #else
                 case 0x6f: TRACEI("movq modrm, mm");
-                           READMODRM; VLOAD(mm_modrm_val, mm_modrm_reg, 64); break;
+                           READMODRM; VMOV(mm_modrm_val, mm_modrm_reg, 64); break;
                 case 0x7f: TRACEI("movq mm, modrm");
-                           READMODRM; VSTORE(mm_modrm_reg, mm_modrm_val, 64); break;
+                           READMODRM; VMOV(mm_modrm_reg, mm_modrm_val, 64); break;
 #endif
 
                 default: TRACEI("undefined");
@@ -871,9 +874,9 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                     READINSN;
                     switch (insn) {
                         case 0x10: TRACEI("movsd xmm:modrm, xmm");
-                                   READMODRM; VLOAD_PADMEM(xmm_modrm_val, xmm_modrm_reg,64); break;
+                                   READMODRM; VMOV_MERGE_REG(xmm_modrm_val, xmm_modrm_reg,64); break;
                         case 0x11: TRACEI("movsd xmm, xmm:modrm");
-                                   READMODRM; VSTORE(xmm_modrm_reg, xmm_modrm_val,64); break;
+                                   READMODRM; VMOV_MERGE_REG(xmm_modrm_reg, xmm_modrm_val,64); break;
 
                         case 0x58: TRACEI("addsd xmm:modrm, xmm");
                                    READMODRM; VS_FMATH(add, xmm_modrm_val, xmm_modrm_reg,64); break;
@@ -904,14 +907,14 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                     READINSN;
                     switch (insn) {
                         case 0x10: TRACEI("movss xmm:modrm, xmm");
-                                   READMODRM; VLOAD_PADMEM(xmm_modrm_val, xmm_modrm_reg,32);
+                                   READMODRM; VMOV_MERGE_REG(xmm_modrm_val, xmm_modrm_reg,32);
                                    break;
                         case 0x11: TRACEI("movss xmm, xmm:modrm");
-                                   READMODRM; VSTORE(xmm_modrm_reg, xmm_modrm_val,32);
+                                   READMODRM; VMOV_MERGE_REG(xmm_modrm_reg, xmm_modrm_val,32);
                                    break;
 
                         case 0x7e: TRACEI("movq xmm:modrm, xmm");
-                                   READMODRM; VZLOAD(xmm_modrm_val, xmm_modrm_reg,64);
+                                   READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,64);
                                    break;
 
                         case 0x18 ... 0x1f: TRACEI("repz nop modrm\t"); READMODRM; break;

diff --git a/emu/vec.c b/emu/vec.c
@@ -1,13 +1,9 @@
 #include <math.h>
 #include <string.h>
 
+#include "emu/vec.h"
 #include "emu/cpu.h"
 
-/////////////////////////////////////////////
-// See header file for the confusing thing //
-// that is argument ordering in this file  //
-/////////////////////////////////////////////
-
 void vec_compare32(struct cpu_state *cpu, float *f2, float *f1) {
     if (isnan(*f1) || isnan(*f2)) {
         cpu->zf = 1;
@@ -36,60 +32,51 @@ void vec_compare32(struct cpu_state *cpu, float *f2, float *f1) {
     cpu->pf_res = 0;
 }
 
-void vec_load32(struct cpu_state *UNUSED(cpu), const void *src, void *dst) {
-    memcpy(dst, src, 4);
-}
-void vec_load64(struct cpu_state *UNUSED(cpu), const void *src, void *dst) {
-    memcpy(dst, src, 8);
-}
-void vec_load128(struct cpu_state *UNUSED(cpu), const void *src, void *dst) {
-    memcpy(dst, src, 16);
-}
-
 static inline void zero_xmm(union xmm_reg *xmm) {
     xmm->qw[0] = 0;
     xmm->qw[1] = 0;
 }
-#define ZLOAD(sz) \
-void vec_zload##sz(struct cpu_state *cpu, const union xmm_reg *src, union xmm_reg *dst) { \
-    zero_xmm(dst); \
-    vec_load##sz(cpu, src, dst); \
-}
-ZLOAD(32)
-ZLOAD(64)
-ZLOAD(128)
-#undef ZLOAD
 
-void vec_store32(struct cpu_state *UNUSED(cpu), void *dst, void *src) {
+#define VEC_ZERO_COPY(zero, copy) /* defines vec_zero<zero>_copy<copy>(cpu, src, dst); zero and copy are sizes in bits */ \
+    void vec_zero##zero##_copy##copy(NO_CPU, const void *src, void *dst) { \
+        memset(dst, 0, zero/8); /* clear the first zero/8 bytes of dst */ \
+        memcpy(dst, src, copy/8); /* then overwrite the first copy/8 bytes of dst from src */ \
+    }
+VEC_ZERO_COPY(128, 128) // vec_zero128_copy128: clear 16 bytes, copy 16 bytes
+VEC_ZERO_COPY(128, 64) // vec_zero128_copy64: clear 16 bytes, copy 8 bytes
+VEC_ZERO_COPY(128, 32) // vec_zero128_copy32: clear 16 bytes, copy 4 bytes
+VEC_ZERO_COPY(64, 64) // vec_zero64_copy64: clear 8 bytes, copy 8 bytes
+
+void vec_merge32(NO_CPU, const void *src, void *dst) { // copies 4 bytes from src to dst; the rest of dst is not cleared
     memcpy(dst, src, 4);
 }
-void vec_store64(struct cpu_state *UNUSED(cpu), void *dst, void *src) {
+void vec_merge64(NO_CPU, const void *src, void *dst) { // copies 8 bytes from src to dst; the rest of dst is not cleared
     memcpy(dst, src, 8);
 }
-void vec_store128(struct cpu_state *UNUSED(cpu), void *dst, void *src) {
+void vec_merge128(NO_CPU, const void *src, void *dst) { // copies 16 bytes from src to dst
     memcpy(dst, src, 16);
 }
 
-void vec_imm_shiftr64(struct cpu_state *UNUSED(cpu), const uint8_t amount, union xmm_reg *src) {
+void vec_imm_shiftr64(NO_CPU, const uint8_t amount, union xmm_reg *dst) { // shifts dst->qw[0] and dst->qw[1] right by amount, in place
     if (amount > 63) {
-        zero_xmm(src);
+        zero_xmm(dst); // amounts above 63 clear both qw elements; presumably qw elements are 64 bits wide, where a larger C shift would be undefined - confirm against union xmm_reg
     } else {
-        src->qw[0] >>= amount;
-        src->qw[1] >>= amount;
+        dst->qw[0] >>= amount; // each qw element is shifted independently
+        dst->qw[1] >>= amount;
     }
 }
 
-void vec_xor128(struct cpu_state *UNUSED(cpu), union xmm_reg *src, union xmm_reg *dst) {
+void vec_xor128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) { // dst ^= src over both qw elements; src is only read
     dst->qw[0] ^= src->qw[0];
     dst->qw[1] ^= src->qw[1];
 }
 
-void vec_fadds64(struct cpu_state *UNUSED(cpu), const double *src, double *dst) {
+void vec_fadds64(NO_CPU, const double *src, double *dst) { // scalar double add: *dst = *dst + *src
     *dst += *src;
 }
-void vec_fmuls64(struct cpu_state *UNUSED(cpu), const double *src, double *dst) {
+void vec_fmuls64(NO_CPU, const double *src, double *dst) { // scalar double multiply: *dst = *dst * *src
     *dst *= *src;
 }
-void vec_fsubs64(struct cpu_state *UNUSED(cpu), const double *src, double *dst) {
+void vec_fsubs64(NO_CPU, const double *src, double *dst) { // scalar double subtract: *dst = *dst - *src
     *dst -= *src;
 }
diff --git a/emu/vec.h b/emu/vec.h
@@ -3,45 +3,26 @@
 
 #include "emu/cpu.h"
 
-void vec_compare32(struct cpu_state *UNUSED(cpu), float *f2, float *f1);
-
-/**
- * Argument ordering swaps back and forth because laziness has taken
- * precedence over actual quality. To minimize gadget complicatedness,
- * the second argument is always an XMM. If either arg is memory, the
- * first one is.
- *
- * Corresponding with jit/gen.c:
- * =============================
- * - If v(...) is being used, the first argument is source.
- * - If v_write(...) is being used, the first argument is being written to.
- * Because the first argument is the operand that might be memory.
- * 
- *  jit/gen method | arg order
- * ----------------|------------
- *  v()            | const a, b    
- *  v_write()      | a, const b
- */
-
-void vec_load32(struct cpu_state *UNUSED(cpu), const union xmm_reg *src, union xmm_reg *dst);
-void vec_load64(struct cpu_state *UNUSED(cpu), const union xmm_reg *src, union xmm_reg *dst);
-void vec_load128(struct cpu_state *UNUSED(cpu), const union xmm_reg *src, union xmm_reg *dst);
-
-// Zeroes out the destination before loading.
-// Used in some instructions like movss when the src is memory.
-void vec_zload32(struct cpu_state *UNUSED(cpu), const union xmm_reg *src, union xmm_reg *dst);
-void vec_zload64(struct cpu_state *UNUSED(cpu), const union xmm_reg *src, union xmm_reg *dst);
-void vec_zload128(struct cpu_state *UNUSED(cpu), const union xmm_reg *src, union xmm_reg *dst);
-
-void vec_store32(struct cpu_state *UNUSED(cpu), union xmm_reg *src, const union xmm_reg *dst);
-void vec_store64(struct cpu_state *UNUSED(cpu), union xmm_reg *src, const union xmm_reg *dst);
-void vec_store128(struct cpu_state *UNUSED(cpu), union xmm_reg *src, const union xmm_reg *dst);
-
-void vec_imm_shiftr64(struct cpu_state *UNUSED(cpu), const uint8_t amount, union xmm_reg *src);
-void vec_xor128(struct cpu_state *cpu, union xmm_reg *src, union xmm_reg *dst);
-
-void vec_fadds64(struct cpu_state *cpu, const double *src, double *dst);
-void vec_fmuls64(struct cpu_state *cpu, const double *src, double *dst);
-void vec_fsubs64(struct cpu_state *cpu, const double *src, double *dst);
+#define NO_CPU struct cpu_state *UNUSED(cpu) // spelling of the leading cpu parameter, wrapped in UNUSED()
+void vec_compare32(NO_CPU, float *f2, float *f1); // NOTE(review): the definition in emu/vec.c does use cpu (it writes cpu->zf), so NO_CPU is only the prototype spelling here
+
+// arguments are in src, dst order (vec_imm_shiftr64 takes the immediate shift amount in place of src)
+
+void vec_zero128_copy128(NO_CPU, const void *src, void *dst); // zero 16 bytes of dst, then copy 16 bytes from src
+void vec_zero128_copy64(NO_CPU, const void *src, void *dst); // zero 16 bytes of dst, then copy 8 bytes from src
+void vec_zero128_copy32(NO_CPU, const void *src, void *dst); // zero 16 bytes of dst, then copy 4 bytes from src
+void vec_zero64_copy64(NO_CPU, const void *src, void *dst); // zero 8 bytes of dst, then copy 8 bytes from src
+
+// "merge" copies N/8 bytes from src into the start of dst and leaves the remaining bytes of dst untouched (no zeroing first)
+void vec_merge32(NO_CPU, const void *src, void *dst);
+void vec_merge64(NO_CPU, const void *src, void *dst);
+void vec_merge128(NO_CPU, const void *src, void *dst);
+
+void vec_imm_shiftr64(NO_CPU, const uint8_t amount, union xmm_reg *dst); // shifts both qw elements of dst right by amount, in place
+void vec_xor128(NO_CPU, union xmm_reg *src, union xmm_reg *dst); // dst ^= src
+
+void vec_fadds64(NO_CPU, const double *src, double *dst); // *dst += *src
+void vec_fmuls64(NO_CPU, const double *src, double *dst); // *dst *= *src
+void vec_fsubs64(NO_CPU, const double *src, double *dst); // *dst -= *src
 
 #endif
diff --git a/jit/gadgets-aarch64/misc.S b/jit/gadgets-aarch64/misc.S
@@ -196,27 +196,41 @@ do_helper 2
         save_c
         mov x0, _cpu
 
-        # r/m argument, first
-        .ifin(\rm, reg)
-            ldrb w1, [_ip, 9]
+        # the argument order should be a consistent src, dst
+        .ifc \rm,reg
+            # src
+            ldrb w1, [_ip, 8]
             add x1, x0, x1
-        .endifin
-        .ifin(\rm, read,write)
+            # dst
+            ldrb w2, [_ip, 9]
+            add x2, x0, x2
+        .endif
+        .ifc \rm,read
+            # src
             mov x1, _xaddr
-        .endifin
-        .ifin(\rm, imm)
-            ldrh w1, [_ip, 9]
-        .endifin
+            # dst
+            ldrb w2, [_ip, 16]
+            add x2, x0, x2
+        .endif
+        .ifc \rm,write
+            # src
+            ldrb w1, [_ip, 16]
+            add x1, x0, x1
+            # dst
+            mov x2, _xaddr
+        .endif
+        .ifc \rm,imm
+            # src
+            ldrh w1, [_ip, 8]
+            # dst
+            ldrb w2, [_ip, 10]
+            add x2, x0, x2
+        .endif
 
-        # reg argument, second
         .ifin(\rm, read,write)
-            ldr x2, [_ip, 16]
-            add x2, x0, x2
             ldr x8, [_ip, 8]
         .endifin
         .ifin(\rm, reg,imm)
-            ldr x2, [_ip, 8]
-            add x2, x0, x2
             ldr x8, [_ip]
         .endifin
         blr x8

diff --git a/jit/gadgets-x86_64/misc.S b/jit/gadgets-x86_64/misc.S
@@ -147,27 +147,41 @@ do_helper 2
         movq %_cpu, %rdi
         xorq %r14, %r14
 
-        # r/m argument, first
-        .ifin(\rm, reg)
-            movb 9(%_ip), %r14b
+        # the argument order should be a consistent src, dst
+        .ifc \rm,reg
+            # src
+            movb 8(%_ip), %r14b
             leaq (%_cpu,%r14), %rsi
-        .endifin
-        .ifin(\rm, read,write)
+            # dst
+            movb 9(%_ip), %r14b
+            leaq (%_cpu,%r14), %rdx
+        .endif
+        .ifc \rm,read
+            # src
             movq %_addrq, %rsi
-        .endifin
-        .ifin(\rm, imm)
-            movb 9(%_ip), %sil
-        .endifin
-
-        # reg argument, second
-        .ifin(\rm, read,write)
+            # dst
             movb 16(%_ip), %r14b
             leaq (%_cpu,%r14), %rdx
+        .endif
+        .ifc \rm,write
+            # src
+            movb 16(%_ip), %r14b
+            leaq (%_cpu,%r14), %rsi
+            # dst
+            movq %_addrq, %rdx
+        .endif
+        .ifc \rm,imm
+            # src: zero-extend the 16-bit immediate so no stale bits of %rsi reach a callee taking a narrow integer
+            movzwl 8(%_ip), %esi
+            # dst: %_cpu plus the byte-sized offset stored at 10(%_ip)
+            movb 10(%_ip), %r14b
+            leaq (%_cpu,%r14), %rdx
+        .endif
+
+        .ifin(\rm, read,write)
             callq *8(%_ip)
         .endifin
         .ifin(\rm, reg,imm)
-            movb 8(%_ip), %r14b
-            leaq (%_cpu,%r14), %rdx
             callq *(%_ip)
         .endifin