Skip to content

Commit

Permalink
Implement a bunch of SSE instructions
Browse files Browse the repository at this point in the history
punpcklbw, pshufd, pcmpeqb, pcmpeqd, pextrw, pmovmskb, movups, pshuflw,
movdqu. These are all used by cargo. There's still some bug where it
gets stuck in a loop, but building cargo with debug symbols takes
forever. Only tested on x86, no guarantees for arm64.

Also added a bunch of vector stuff to the qemu test.
  • Loading branch information
tbodt committed May 25, 2020
1 parent 19a65a7 commit 0b081eb
Show file tree
Hide file tree
Showing 11 changed files with 502 additions and 365 deletions.
5 changes: 3 additions & 2 deletions emu/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ union mm_reg {
};
// Storage for one XMM register. Every member aliases the same bytes, so each
// array is just a different lane view of the register contents; use the view
// whose element width matches the instruction being emulated.
union xmm_reg {
// qword_t / dword_t are declared elsewhere in the project
// (presumably 64-bit and 32-bit integers -- TODO confirm).
qword_t qw[2];
dword_t dw[4];
// Fixed-width unsigned integer views: 4 x 32-bit, 8 x 16-bit, 16 x 8-bit.
uint32_t u32[4];
uint16_t u16[8];
uint8_t u8[16];
// Floating-point views: 4 floats or 2 doubles.
float f32[4];
double f64[2];
// TODO more forms
};
static_assert(sizeof(union xmm_reg) == 16, "xmm_reg size");
static_assert(sizeof(union mm_reg) == 8, "mm_reg size");
Expand Down
53 changes: 40 additions & 13 deletions emu/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
// Read an 8-bit immediate into imm through READIMM_, then narrow it to its
// low byte and reinterpret that byte as signed (int8_t) before storing it back.
#define READIMM8 READIMM_(imm, 8); imm = (int8_t) (uint8_t) imm
// Read a 16-bit immediate into imm through READIMM_; no cast is applied.
#define READIMM16 READIMM_(imm, 16)
// READMODRM, then bail out with UNDEFINED when the operand is a register
// (i.e. the instruction form requires a memory operand).
// NOTE(review): these expand to several statements including an unbraced if,
// so they are only safe when used as a full statement, not as the body of
// another if/else.
#define READMODRM_MEM READMODRM; if (modrm.type == modrm_reg) UNDEFINED
// READMODRM, then bail out with UNDEFINED when the operand is anything other
// than a register (i.e. the instruction form forbids a memory operand).
#define READMODRM_NOMEM READMODRM; if (modrm.type != modrm_reg) UNDEFINED

restart:
TRACEIP();
Expand Down Expand Up @@ -266,13 +267,18 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
#endif

#if OP_SIZE == 16
case 0x60: TRACEI("punpcklbw xmm:modrm, xmm");
READMODRM; V_OP(unpack_bw, xmm_modrm_val, xmm_modrm_reg,128); break;

case 0x6e: TRACEI("movd modrm, xmm");
// TODO: this is supposed to use general registers!
READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,32); break;
READMODRM; VMOV(modrm_val, xmm_modrm_reg,32); break;

case 0x6f: TRACEI("movdqa xmm:modrm, xmm");
READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break;

case 0x70: TRACEI("pshufd xmm:modrm, xmm, imm8");
READMODRM; READIMM8; V_OP_IMM(shuffle_d, xmm_modrm_val, xmm_modrm_reg,128); break;

case 0x73: READMODRM;
switch (modrm.opcode) {
case 0x02: TRACEI("psrlq imm, xmm");
Expand All @@ -281,19 +287,34 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
}
break;

case 0x74: TRACEI("pcmpeqb xmm:modrm, xmm");
READMODRM; V_OP(compare_eqb, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x76: TRACEI("pcmpeqd xmm:modrm, xmm");
READMODRM; V_OP(compare_eqd, xmm_modrm_val, xmm_modrm_reg,128); break;

case 0x7e: TRACEI("movd xmm, modrm");
// TODO: this is supposed to use general registers!
READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,32); break;
READMODRM; VMOV(xmm_modrm_reg, modrm_val,32); break;

case 0x7f: TRACEI("movdqa xmm, xmm:modrm");
READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;

case 0xc5: TRACEI("pextrw xmm, modrm_val, imm8");
READMODRM; READIMM8; V_OP_IMM(extract_w, xmm_modrm_reg, modrm_val,128); break;

case 0xd6: TRACEI("movq xmm, xmm:modrm");
READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,64); break;

case 0xef: TRACEI("pxor xmm:modrm xmm");
case 0xd7: TRACEI("pmovmskb xmm:modrm, reg");
READMODRM_NOMEM; V_OP(movmask_b, xmm_modrm_val, modrm_reg,128); break;

case 0xef: TRACEI("pxor xmm:modrm, xmm");
READMODRM; V_OP(xor, xmm_modrm_val, xmm_modrm_reg,128); break;
#else
case 0x10: TRACEI("movups xmm:modrm, xmm");
READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x11: TRACEI("movups xmm, xmm:modrm");
READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;

case 0x6f: TRACEI("movq modrm, mm");
READMODRM; VMOV(mm_modrm_val, mm_modrm_reg, 64); break;
case 0x7f: TRACEI("movq mm, modrm");
Expand Down Expand Up @@ -881,8 +902,8 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {

case 0x2a: TRACEI("cvtsi2sd modrm, xmm");
READMODRM; V_OP(cvtsi2sd, modrm_val, xmm_modrm_reg,32); break;
case 0x2c: TRACEI("cvtsd2si reg, xmm:modrm");
READMODRM; V_OP(cvtsd2si, xmm_modrm_val, modrm_reg,64); break;
case 0x2c: TRACEI("cvttsd2si reg, xmm:modrm");
READMODRM; V_OP(cvttsd2si, xmm_modrm_val, modrm_reg,64); break;
case 0x5a: TRACEI("cvtsd2ss xmm:modrm, xmm");
READMODRM; V_OP(cvtsd2ss, xmm_modrm_val, xmm_modrm_reg,64); break;

Expand All @@ -895,6 +916,9 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
case 0x5e: TRACEI("divsd xmm:modrm, xmm");
READMODRM; V_OP(fdivs, xmm_modrm_val, xmm_modrm_reg,64); break;

case 0x70: TRACEI("pshuflw xmm:modrm, xmm, imm8");
READMODRM; READIMM8; V_OP_IMM(shuffle_lw, xmm_modrm_val, xmm_modrm_reg,128); break;

case 0x18 ... 0x1f: TRACEI("rep nop modrm\t"); READMODRM; break;
default: TRACE("undefined"); UNDEFINED;
}
Expand All @@ -917,18 +941,21 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
READINSN;
switch (insn) {
case 0x10: TRACEI("movss xmm:modrm, xmm");
READMODRM; VMOV_MERGE_REG(xmm_modrm_val, xmm_modrm_reg,32);
break;
READMODRM; VMOV_MERGE_REG(xmm_modrm_val, xmm_modrm_reg,32); break;
case 0x11: TRACEI("movss xmm, xmm:modrm");
READMODRM; VMOV_MERGE_REG(xmm_modrm_reg, xmm_modrm_val,32);
break;
READMODRM; VMOV_MERGE_REG(xmm_modrm_reg, xmm_modrm_val,32); break;

case 0x6f: TRACEI("movdqu xmm:modrm, xmm");
READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break;

case 0x7e: TRACEI("movq xmm:modrm, xmm");
READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,64);
break;
READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,64); break;

case 0x18 ... 0x1f: TRACEI("repz nop modrm\t"); READMODRM; break;

case 0x7f: TRACEI("movdqu xmm, xmm:modrm");
READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;

// tzcnt is like bsf but the result when the input is zero is defined as the operand size
// for now, it can just be an alias
case 0xbc: TRACEI("~~tzcnt~~ bsf modrm, reg");
Expand Down
43 changes: 42 additions & 1 deletion emu/vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ VEC_ZERO_COPY(128, 128)
VEC_ZERO_COPY(128, 64)
VEC_ZERO_COPY(128, 32)
VEC_ZERO_COPY(64, 64)
VEC_ZERO_COPY(32, 32)

void vec_merge32(NO_CPU, const void *src, void *dst) {
memcpy(dst, src, 4);
Expand Down Expand Up @@ -87,9 +88,49 @@ void vec_fdivs64(NO_CPU, const double *src, double *dst) {
// cvtsi2sd: convert a 32-bit integer at src to a double stored at dst.
//
// The instruction's source operand is a *signed* doubleword, but the value
// arrives here typed as uint32_t. Reinterpret it as int32_t before widening;
// converting the unsigned value directly would turn negative inputs
// (e.g. 0xffffffff, which is -1) into large positive doubles (4294967295.0).
void vec_cvtsi2sd32(NO_CPU, const uint32_t *src, double *dst) {
    *dst = (int32_t) *src;
}
void vec_cvtsd2si64(NO_CPU, const double *src, uint32_t *dst) {
// cvttsd2si: convert the double at src to a signed 32-bit integer, truncating
// toward zero, and store its bit pattern at dst.
//
// Converting a double straight to uint32_t is undefined behavior in C for
// negative or out-of-range values (and saturates negatives to 0 on some
// hosts), so convert through int32_t instead. Inputs whose truncated value
// does not fit in int32_t, and NaN, yield 0x80000000, the "integer
// indefinite" value the instruction returns in that case.
void vec_cvttsd2si64(NO_CPU, const double *src, uint32_t *dst) {
    const double value = *src;
    // Truncation toward zero fits in int32_t exactly when
    // -2^31 - 1 < value < 2^31. The negated form makes NaN (for which every
    // comparison is false) take the indefinite path as well.
    if (!(value > -2147483649.0 && value < 2147483648.0)) {
        *dst = 0x80000000u;
        return;
    }
    *dst = (uint32_t) (int32_t) value;
}
// cvtsd2ss: narrow the double at src to single precision and store it at dst.
void vec_cvtsd2ss64(NO_CPU, const double *src, float *dst) {
    const double wide = *src;
    *dst = (float) wide;
}

// punpcklbw: interleave the low 8 bytes of dst with the low 8 bytes of src.
// Afterwards dst byte 2*k holds the old dst byte k and dst byte 2*k+1 holds
// src byte k, for k = 0..7.
void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
    // Snapshot both low halves up front so the in-place writes to dst cannot
    // clobber a byte that still has to be read (this also covers src == dst).
    uint8_t from_src[8];
    uint8_t from_dst[8];
    for (unsigned k = 0; k < 8; k++) {
        from_src[k] = src->u8[k];
        from_dst[k] = dst->u8[k];
    }

    for (unsigned k = 0; k < 8; k++) {
        dst->u8[2 * k] = from_dst[k];
        dst->u8[2 * k + 1] = from_src[k];
    }
}

// pshuflw: shuffle the four low 16-bit words of src into dst and copy the
// high 64 bits of src through unchanged.
// encoding holds four 2-bit selectors, lowest bits first: selector n picks
// which of the low source words lands in destination word n.
void vec_shuffle_lw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding) {
    // Read from a private copy so writes to dst cannot disturb the source
    // words if the two pointers alias.
    const union xmm_reg snapshot = *src;
    unsigned selectors = encoding;
    for (unsigned word = 0; word < 4; word++) {
        dst->u16[word] = snapshot.u16[selectors & 3];
        selectors >>= 2;
    }
    dst->qw[1] = src->qw[1];
}
// pshufd: shuffle the four 32-bit lanes of src into dst.
// encoding holds four 2-bit selectors, lowest bits first: selector n picks
// which source lane lands in destination lane n.
void vec_shuffle_d128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding) {
    // Read from a private copy so writes to dst cannot disturb the source
    // lanes if the two pointers alias.
    const union xmm_reg snapshot = *src;
    unsigned selectors = encoding;
    for (unsigned lane = 0; lane < 4; lane++) {
        dst->u32[lane] = snapshot.u32[selectors & 3];
        selectors >>= 2;
    }
}

// pcmpeqb: bytewise equality compare of dst against src, in place.
// Each byte of dst becomes 0xff where the two inputs matched and 0 elsewhere.
void vec_compare_eqb128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
    for (unsigned lane = 0; lane < array_size(src->u8); lane++) {
        if (src->u8[lane] == dst->u8[lane])
            dst->u8[lane] = 0xff;
        else
            dst->u8[lane] = 0;
    }
}
// pcmpeqd: 32-bit lane equality compare of dst against src, in place.
// Each lane of dst becomes all ones where the two inputs matched and 0 elsewhere.
void vec_compare_eqd128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
    for (unsigned lane = 0; lane < array_size(src->u32); lane++) {
        if (src->u32[lane] == dst->u32[lane])
            dst->u32[lane] = 0xffffffffu;
        else
            dst->u32[lane] = 0;
    }
}

// pmovmskb: gather the top bit of every byte of src into a bitmask at dst.
// Bit n of the result is the most significant bit of src byte n; all higher
// result bits are zero.
void vec_movmask_b128(NO_CPU, const union xmm_reg *src, uint32_t *dst) {
    *dst = 0;
    for (unsigned lane = 0; lane < array_size(src->u8); lane++) {
        // (byte >> 7) is 1 exactly when the sign bit is set, 0 otherwise.
        const uint32_t sign = src->u8[lane] >> 7;
        *dst |= sign << lane;
    }
}

// pextrw: copy one 16-bit word of src, zero-extended, into dst.
// Only the low three bits of index are used to choose among the eight words.
void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index) {
    const unsigned lane = index & 7;
    *dst = src->u16[lane];
}
13 changes: 11 additions & 2 deletions emu/vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ void vec_zero128_copy128(NO_CPU, const void *src, void *dst);
void vec_zero128_copy64(NO_CPU, const void *src, void *dst);
void vec_zero128_copy32(NO_CPU, const void *src, void *dst);
void vec_zero64_copy64(NO_CPU, const void *src, void *dst);

void vec_zero32_copy32(NO_CPU, const void *src, void *dst);
// "merge" means don't zero the register before writing to it
void vec_merge32(NO_CPU, const void *src, void *dst);
void vec_merge64(NO_CPU, const void *src, void *dst);
Expand All @@ -27,7 +27,16 @@ void vec_fsubs64(NO_CPU, const double *src, double *dst);
void vec_fdivs64(NO_CPU, const double *src, double *dst);

void vec_cvtsi2sd32(NO_CPU, const uint32_t *src, double *dst);
void vec_cvtsd2si64(NO_CPU, const double *src, uint32_t *dst);
void vec_cvttsd2si64(NO_CPU, const double *src, uint32_t *dst);
void vec_cvtsd2ss64(NO_CPU, const double *src, float *dst);

// TODO organize
// Integer SSE helpers. In each, src is the source operand and dst is the
// destination, which several helpers also read as their second input.

// Interleave the low 8 bytes of dst and src into dst (dst byte first).
void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
// Shuffle the low four 16-bit words of src by the 2-bit selectors in encoding;
// the high 64 bits are copied from src unchanged.
void vec_shuffle_lw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding);
// Shuffle the four 32-bit lanes of src by the 2-bit selectors in encoding.
void vec_shuffle_d128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding);
// Per-byte equality of dst and src; each dst byte becomes all ones or zero.
void vec_compare_eqb128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
// Per-32-bit-lane equality of dst and src; each dst lane becomes all ones or zero.
void vec_compare_eqd128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
// Collect the top bit of each of the 16 bytes of src into the low bits of *dst.
void vec_movmask_b128(NO_CPU, const union xmm_reg *src, uint32_t *dst);
// Store 16-bit word number (index % 8) of src, zero-extended, into *dst.
void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index);

#endif
35 changes: 24 additions & 11 deletions jit/gadgets-aarch64/misc.S
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,9 @@ do_helper 2
.endr

.macro do_vec_helper rm, size=
.gadget vec_helper_\rm\size
.gadget vec_helper_\rm\size\_imm
.ifin(\rm, read,write)
\rm\()_prep (\size), vec_helper_\rm\size
\rm\()_prep (\size), vec_helper_\rm\size\_imm
.endifin
save_regs
save_c
Expand Down Expand Up @@ -227,6 +227,17 @@ do_helper 2
add x2, x0, x2
.endif

.ifc _imm,_imm
# imm for third argument
.ifin(\rm, reg)
ldr w3, [_ip, 12]
movl 12(%_ip), %ecx
.endifin
.ifin(\rm, read,write)
ldr w3, [_ip, 20]
.endifin
.endif

.ifin(\rm, read,write)
ldr x8, [_ip, 8]
.endifin
Expand All @@ -238,7 +249,7 @@ do_helper 2
restore_c
load_regs
.ifc \rm,write
write_done (\size), vec_helper_\rm\size
write_done (\size), vec_helper_\rm\size\_imm
.endif
.ifin(\rm, reg,imm)
gret 2
Expand All @@ -247,18 +258,20 @@ do_helper 2
gret 3
.endifin
.ifc \rm,read
read_bullshit (\size), vec_helper_\rm\size
read_bullshit (\size), vec_helper_\rm\size\_imm
.else N .ifc \rm,write
write_bullshit (\size), vec_helper_\rm\size
write_bullshit (\size), vec_helper_\rm\size\_imm
.endif N .endif
.endm

.irp rm, reg,imm
do_vec_helper \rm
.endr
.irp size, SIZE_LIST,64,128
do_vec_helper read, \size
do_vec_helper write, \size
/* Instantiate every vec helper gadget twice: once with an empty suffix and
   once with the _imm suffix (the variant that loads an immediate as an
   extra helper argument). */
.irp _imm, ,_imm
    /* Register and immediate operand forms take no size. */
    .irp rm, reg,imm
        do_vec_helper \rm, \_imm
    .endr
    /* Memory operand forms are emitted once per access size. The size
       argument must be the substituted loop variable on both lines;
       a bare word here would be passed through literally and give every
       iteration the same gadget name. */
    .irp size, SIZE_LIST,64,128
        do_vec_helper read, \_imm, \size
        do_vec_helper write, \_imm, \size
    .endr
.endr

.gadget fstsw_ax
Expand Down
32 changes: 22 additions & 10 deletions jit/gadgets-x86_64/misc.S
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,10 @@ do_helper 2
do_helper write, \size
.endr

.macro do_vec_helper rm, size=
.gadget vec_helper_\rm\size
.macro do_vec_helper rm, _imm, size=
.gadget vec_helper_\rm\size\_imm
.ifin(\rm, read,write)
\rm\()_prep (\size), vec_helper_\rm\size
\rm\()_prep (\size), vec_helper_\rm\size\_imm
.endifin
save_regs
save_c
Expand Down Expand Up @@ -178,6 +178,16 @@ do_helper 2
leaq (%_cpu,%r14), %rdx
.endif

.ifc _imm,_imm
# imm for third argument
.ifin(\rm, reg)
movl 12(%_ip), %ecx
.endifin
.ifin(\rm, read,write)
movl 20(%_ip), %ecx
.endifin
.endif

.ifin(\rm, read,write)
callq *8(%_ip)
.endifin
Expand All @@ -188,7 +198,7 @@ do_helper 2
restore_c
load_regs
.ifc \rm,write
write_done (\size), vec_helper_\rm\size
write_done (\size), vec_helper_\rm\size\_imm
.endif
.ifin(\rm, reg,imm)
gret 2
Expand All @@ -198,12 +208,14 @@ do_helper 2
.endifin
.endm

.irp rm, reg,imm
do_vec_helper \rm
.endr
.irp size, SIZE_LIST,64,128
do_vec_helper read, \size
do_vec_helper write, \size
/* Instantiate every vec helper gadget twice: once with an empty suffix and
   once with the _imm suffix (the variant that loads an immediate as an
   extra helper argument). */
.irp _imm, ,_imm
/* Register and immediate operand forms take no size. */
.irp rm, reg,imm
do_vec_helper \rm, \_imm
.endr
/* Memory operand forms are emitted once per access size. */
.irp size, SIZE_LIST,64,128
do_vec_helper read, \_imm, \size
do_vec_helper write, \_imm, \size
.endr
.endr

.gadget fstsw_ax
Expand Down
Loading

0 comments on commit 0b081eb

Please sign in to comment.