else
printf("skipped\n");
+ printf("%-40s", "Testing movntdqa 16(%edx),%xmm4...");
+ if ( stack_exec && cpu_has_sse4_1 )
+ {
+ decl_insn(movntdqa);
+
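+ /* pcmpgtb of a register with itself clears %xmm4, so the check below
+  * can only pass if the emulated load really updates it. */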
+ asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+ put_insn(movntdqa, "movntdqa 16(%0), %%xmm4")
+ :: "d" (NULL) );
+
+ set_insn(movntdqa);
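+ /* 0x55 filler, with the 16 bytes at byte offset 16 (the source of the
+  * emulated load) set to 0xff. */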
+ memset(res, 0x55, 64);
+ memset(res + 4, 0xff, 16);
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movntdqa) )
+ goto fail;
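+ /* pmovmskb of the all-ones comparison yields 0xffff only if every
+  * byte of %xmm4 is 0xff. */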
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm4, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmovntdqa (%ecx),%ymm4...");
+ if ( stack_exec && cpu_has_avx2 )
+ {
+ decl_insn(vmovntdqa);
+
+#if 0 /* Don't use AVX2 instructions for now */
+ asm volatile ( "vpxor %%ymm4, %%ymm4, %%ymm4\n"
+ put_insn(vmovntdqa, "vmovntdqa (%0), %%ymm4")
+ :: "c" (NULL) );
+#else
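+ /* vpxor on %xmm4 clears all of %ymm4 (VEX zeroes the upper lanes); the
+  * .byte sequence is the VEX.256 encoding of vmovntdqa (%ecx),%ymm4. */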
+ asm volatile ( "vpxor %xmm4, %xmm4, %xmm4\n"
+ put_insn(vmovntdqa,
+ ".byte 0xc4, 0xe2, 0x7d, 0x2a, 0x21") );
+#endif
+
+ set_insn(vmovntdqa);
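+ /* 0x55 filler, with 32 bytes of 0xff at res + 8 (byte offset 32),
+  * which is where %ecx will point. */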
+ memset(res, 0x55, 96);
+ memset(res + 8, 0xff, 32);
+ regs.ecx = (unsigned long)(res + 8);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovntdqa) )
+ goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+ asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+ "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+ "vpmovmskb %%ymm0, %0" : "=r" (rc) );
+#else
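+ /* Compare each 128-bit half against all-ones separately and merge
+  * the two 16-bit masks. */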
+ asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+ "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+ "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+ "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+ "vpmovmskb %%xmm0, %0\n\t"
+ "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+ rc |= i << 16;
+#endif
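+ /* Both halves must have matched, i.e. rc must now be ~0. */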
+ if ( ~rc )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing stmxcsr (%edx)...");
if ( cpu_has_sse )
{
#define vcpu_has_sse2() vcpu_has( 1, EDX, 26, ctxt, ops)
#define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops)
#define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops)
+#define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops)
#define vcpu_has_sse4_2() vcpu_has( 1, ECX, 20, ctxt, ops)
#define vcpu_has_movbe() vcpu_has( 1, ECX, 22, ctxt, ops)
#define vcpu_has_popcnt() vcpu_has( 1, ECX, 23, ctxt, ops)
case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa {x,y}mm,{x,y}mm/mem */
case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */
case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu {x,y}mm,{x,y}mm/mem */
+ movdqa:
d |= TwoOp;
op_bytes = 16 << vex.l;
if ( vex.opcx != vex_none )
sfence = true;
break;
+ case X86EMUL_OPC_66(0x0f38, 0x2a): /* movntdqa m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
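+ /* (V)MOVNTDQA is defined only with a memory source operand. */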
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* Ignore the non-temporal hint for now, using movdqa instead. */
+ asm volatile ( "mfence" ::: "memory" );
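+ /* Rewrite as (v)movdqa: opcode 0x6f in the 0f map. */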
+ b = 0x6f;
+ if ( vex.opcx == vex_none )
+ vcpu_must_have(sse4_1);
+ else
+ {
+ vex.opcx = vex_0f;
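+ /* Only the 256-bit form of vmovntdqa requires AVX2. */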
+ if ( vex.l )
+ vcpu_must_have(avx2);
+ }
+ state->simd_size = simd_packed_int;
+ goto movdqa;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);