I.e. those which aren't equivalents of SSEn ones.
There's one necessary change to generic code: the faulting behavior of
VMASKMOVP{S,D} requires us to do partial reads/writes.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
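
A minimal sketch of the resulting range narrowing (illustrative only, not part
of the patch; narrow_access() is a hypothetical helper name): only the bytes
spanning the lowest through the highest set mask bit get read or written, and
the access is suppressed altogether when the mask is zero.

    /* Illustrative sketch - mirrors the narrowing done in x86_emulate(). */
    static void narrow_access(unsigned int mask, unsigned int elem_bytes,
                              unsigned int *first_byte, unsigned int *length)
    {
        if ( !mask )
        {
            *first_byte = *length = 0;    /* All mask bits clear: no access. */
            return;
        }
        /* Byte offset of the lowest enabled element ... */
        *first_byte = __builtin_ctz(mask) * elem_bytes;
        /* ... and the length up to and including the highest enabled one. */
        *length = (32 - __builtin_clz(mask)) * elem_bytes - *first_byte;
    }
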
tools/tests/x86_emulator/*.bin
tools/tests/x86_emulator/*.tmp
tools/tests/x86_emulator/asm
-tools/tests/x86_emulator/avx*.h
+tools/tests/x86_emulator/avx*.[ch]
tools/tests/x86_emulator/blowfish.h
tools/tests/x86_emulator/sse*.[ch]
tools/tests/x86_emulator/test_x86_emulator
run: $(TARGET)
./$(TARGET)
-SIMD := sse sse2 sse4
-TESTCASES := blowfish $(SIMD) $(addsuffix -avx,$(filter sse%,$(SIMD)))
+SIMD := sse sse2 sse4 avx
+TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
blowfish-cflags := ""
blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
sse4-vecs := $(sse2-vecs)
sse4-ints := $(sse2-ints)
sse4-flts := $(sse2-flts)
+avx-vecs := 16 32
+avx-ints :=
+avx-flts := 4 8
# When converting SSE to AVX, have the compiler avoid XMM0 to widen
# coverage of the VEX.vvvv checks in the emulator. We must not do this,
# however, for SSE4.1 and later, as there are instructions with XMM0 as
# an implicit operand.
-sse2avx-sse := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse2 := $(sse2avx-sse)
+sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
sse2avx-sse4 := -Wa,-msse2avx
+# For AVX and later, have the compiler avoid XMM0 to widen coverage of
+# the VEX.vvvv checks in the emulator.
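+# E.g. $(call non-sse,avx) expands to -ffixed-xmm0, while $(call non-sse,sse4)
+# expands to nothing.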
+non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+
define simd-defs
$(1)-cflags := \
$(foreach vec,$($(1)-vecs), \
$(foreach int,$($(1)-ints), \
- "-D_$(vec)i$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
- "-D_$(vec)u$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+ "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+ "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
$(foreach flt,$($(1)-flts), \
- "-D_$(vec)f$(flt) -m$(1) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+ "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
$(foreach flt,$($(1)-flts), \
- "-D_f$(flt) -m$(1) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
+ "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
$(1)-avx-cflags := \
$(foreach vec,$($(1)-vecs), \
$(foreach int,$($(1)-ints), \
"-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
- "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
- $(foreach flt,$($(1)-flts), \
- "-D_$(vec)f$(flt) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
- $(foreach flt,$($(1)-flts), \
- "-D_f$(flt) -m$(1) -mfpmath=sse $(sse2avx-$(1)) -O2 -DFLOAT_SIZE=$(flt)")
+ "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
endef
$(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
#if VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
-# if defined(__SSE4_1__)
+# if defined(__AVX__) && defined(FLOAT_SIZE)
+# if ELEM_SIZE == 4
+# define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
+# elif ELEM_SIZE == 8
+# define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
+# endif
+# elif defined(__SSE4_1__)
# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
# elif defined(__SSE__) && ELEM_SIZE == 4
# define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
# endif
# endif
+#elif VEC_SIZE == 32
+# if defined(__AVX__) && ELEM_SIZE == 4
+# define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
+# elif defined(__AVX__) && ELEM_SIZE == 8
+# define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
+# endif
#endif
#ifndef to_bool
# elif FLOAT_SIZE == 8
# define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
# endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+# define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
+# elif FLOAT_SIZE == 8
+# define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
+# endif
#endif
#if VEC_SIZE == FLOAT_SIZE
#endif
#if FLOAT_SIZE == 4 && defined(__SSE__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+# define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
+# define max(x, y) __builtin_ia32_maxps256(x, y)
+# define min(x, y) __builtin_ia32_minps256(x, y)
+# define recip(x) __builtin_ia32_rcpps256(x)
+# define rsqrt(x) __builtin_ia32_rsqrtps256(x)
+# define sqrt(x) __builtin_ia32_sqrtps256(x)
+# define swap(x) ({ \
+ vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
+ __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+# define swap2(x) ({ \
+ vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+ __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
+# ifdef __AVX__
+# define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
+# endif
# define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
# define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
# define max(x, y) __builtin_ia32_maxps(x, y)
# define rsqrt(x) __builtin_ia32_rsqrtps(x)
# define sqrt(x) __builtin_ia32_sqrtps(x)
# define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+# ifdef __AVX__
+# define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
+# endif
# elif VEC_SIZE == 4
# define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
# define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
# define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
# endif
#elif FLOAT_SIZE == 8 && defined(__SSE2__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+# define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
+# define max(x, y) __builtin_ia32_maxpd256(x, y)
+# define min(x, y) __builtin_ia32_minpd256(x, y)
+# define recip(x) ({ \
+ float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
+ t_ = __builtin_ia32_vextractf128_ps256( \
+ __builtin_ia32_rcpps256( \
+ __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
+ __builtin_ia32_cvtps2pd256(t_); \
+})
+# define rsqrt(x) ({ \
+ float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
+ float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
+ t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
+ t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
+ __builtin_ia32_cvtps2pd256(t1_); \
+})
+# define sqrt(x) __builtin_ia32_sqrtpd256(x)
+# define swap(x) ({ \
+ vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
+ __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
# define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
# define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
# define max(x, y) __builtin_ia32_maxpd(x, y)
# define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
# define sqrt(x) __builtin_ia32_sqrtpd(x)
# define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+# ifdef __AVX__
+# define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
+ __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
+# endif
# elif VEC_SIZE == 8
# define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
# define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
# define hadd(x, y) __builtin_ia32_haddpd(x, y)
# define hsub(x, y) __builtin_ia32_hsubpd(x, y)
# endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+# define addsub(x, y) __builtin_ia32_addsubps256(x, y)
+# define dup_hi(x) __builtin_ia32_movshdup256(x)
+# define dup_lo(x) __builtin_ia32_movsldup256(x)
+# define hadd(x, y) ({ \
+ vec_t t_ = __builtin_ia32_haddps256(x, y); \
+ (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+# define hsub(x, y) ({ \
+ vec_t t_ = __builtin_ia32_hsubps256(x, y); \
+ (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+# elif FLOAT_SIZE == 8
+# define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
+# define dup_lo(x) __builtin_ia32_movddup256(x)
+# define hadd(x, y) ({ \
+ vec_t t_ = __builtin_ia32_haddpd256(x, y); \
+ (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+# define hsub(x, y) ({ \
+ vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
+ (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+# endif
#endif
#if VEC_SIZE == 16 && defined(__SSSE3__)
# if INT_SIZE == 1
# define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
# endif
#endif
+#if VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+# define dot_product(x, y) ({ \
+ vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
+ (vec_t){t_[0] + t_[4]}; \
+})
+# define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
+# define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
+# define select2(d, x, y, m) ({ \
+ vsi_t m_ = (vsi_t)(m); \
+ *(d) = __builtin_ia32_maskloadps256(&(x), m_); \
+ __builtin_ia32_maskstoreps256(d, ~m_, y); \
+})
+# define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
+# elif FLOAT_SIZE == 8
+# define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
+# define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
+# define select2(d, x, y, m) ({ \
+ vdi_t m_ = (vdi_t)(m); \
+ *(d) = __builtin_ia32_maskloadpd256(&(x), m_); \
+ __builtin_ia32_maskstorepd256(d, ~m_, y); \
+})
+# define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
+# endif
+#endif
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
if ( !to_bool(swap(src) == inv) ) return __LINE__;
#endif
+#ifdef swap2
+ touch(src);
+ if ( !to_bool(swap2(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(broadcast)
+ if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
+#endif
+
#if defined(interleave_lo) && defined(interleave_hi)
touch(src);
x = interleave_lo(inv, src);
if ( !to_bool(z == y) ) return __LINE__;
#endif
+#ifdef select2
+# ifdef UINT_SIZE
+ select2(&z, src, inv, alt);
+# else
+ select2(&z, src, inv, alt > 0);
+# endif
+ if ( !to_bool(z == y) ) return __LINE__;
+#endif
+
#ifdef mix
touch(src);
touch(inv);
#include "sse.h"
#include "sse2.h"
#include "sse4.h"
-#include "sse-avx.h"
#include "sse2-avx.h"
#include "sse4-avx.h"
+#include "avx.h"
#define verbose false /* Switch to true for far more logging. */
{
return cpu_has_avx;
}
-#define simd_check_sse_avx simd_check_avx
#define simd_check_sse2_avx simd_check_avx
#define simd_check_sse4_avx simd_check_avx
SIMD(SSE4 packed u32, sse4, 16u4),
SIMD(SSE4 packed s64, sse4, 16i8),
SIMD(SSE4 packed u64, sse4, 16u8),
- SIMD(SSE/AVX scalar single, sse_avx, f4),
- SIMD(SSE/AVX packed single, sse_avx, 16f4),
- SIMD(SSE2/AVX scalar single, sse2_avx, f4),
- SIMD(SSE2/AVX packed single, sse2_avx, 16f4),
- SIMD(SSE2/AVX scalar double, sse2_avx, f8),
- SIMD(SSE2/AVX packed double, sse2_avx, 16f8),
SIMD(SSE2/AVX packed s8, sse2_avx, 16i1),
SIMD(SSE2/AVX packed u8, sse2_avx, 16u1),
SIMD(SSE2/AVX packed s16, sse2_avx, 16i2),
SIMD(SSE2/AVX packed u32, sse2_avx, 16u4),
SIMD(SSE2/AVX packed s64, sse2_avx, 16i8),
SIMD(SSE2/AVX packed u64, sse2_avx, 16u8),
- SIMD(SSE4/AVX scalar single, sse4_avx, f4),
- SIMD(SSE4/AVX packed single, sse4_avx, 16f4),
- SIMD(SSE4/AVX scalar double, sse4_avx, f8),
- SIMD(SSE4/AVX packed double, sse4_avx, 16f8),
SIMD(SSE4/AVX packed s8, sse4_avx, 16i1),
SIMD(SSE4/AVX packed u8, sse4_avx, 16u1),
SIMD(SSE4/AVX packed s16, sse4_avx, 16i2),
SIMD(SSE4/AVX packed u32, sse4_avx, 16u4),
SIMD(SSE4/AVX packed s64, sse4_avx, 16i8),
SIMD(SSE4/AVX packed u64, sse4_avx, 16u8),
+ SIMD(AVX scalar single, avx, f4),
+ SIMD(AVX 128bit single, avx, 16f4),
+ SIMD(AVX 256bit single, avx, 32f4),
+ SIMD(AVX scalar double, avx, f8),
+ SIMD(AVX 128bit double, avx, 16f8),
+ SIMD(AVX 256bit double, avx, 32f8),
#undef SIMD_
#undef SIMD
};
else
printf("skipped\n");
+    /*
+     * The following "maskmov" tests not only make sure the written data is
+     * correct, but also verify (by placing operands on the mapping
+     * boundaries) that elements controlled by clear mask bits aren't being
+     * accessed.
+     */
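+    /*
+     * E.g. the first vmaskmovps store enables only the lowest element and
+     * points the memory operand at the last 32-bit slot of the mapping, so
+     * touching any of the masked-off elements would run past its end.
+     */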
+ printf("%-40s", "Testing vmaskmovps %xmm1,%xmm2,(%edx)...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmaskmovps);
+
+ asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n\t"
+ "vcmpeqss %%xmm1, %%xmm1, %%xmm2\n\t"
+ put_insn(vmaskmovps, "vmaskmovps %%xmm1, %%xmm2, (%0)")
+ :: "d" (NULL) );
+
+ memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+ set_insn(vmaskmovps);
+ regs.edx = (unsigned long)res + MMAP_SZ - 4;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+ res[MMAP_SZ / sizeof(*res) - 1] ||
+ memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+ res + MMAP_SZ / sizeof(*res) - 4, 12) )
+ goto fail;
+
+ asm volatile ( "vinsertps $0b00110111, %xmm2, %xmm2, %xmm2" );
+ memset(res, 0xdb, 32);
+ set_insn(vmaskmovps);
+ regs.edx = (unsigned long)(res - 3);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+ res[0] || memcmp(res + 1, res + 4, 12) )
+ goto fail;
+
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmaskmovpd %xmm1,%xmm2,(%edx)...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmaskmovpd);
+
+ asm volatile ( "vxorpd %%xmm1, %%xmm1, %%xmm1\n\t"
+ "vcmpeqsd %%xmm1, %%xmm1, %%xmm2\n\t"
+ put_insn(vmaskmovpd, "vmaskmovpd %%xmm1, %%xmm2, (%0)")
+ :: "d" (NULL) );
+
+ memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+ set_insn(vmaskmovpd);
+ regs.edx = (unsigned long)res + MMAP_SZ - 8;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+ res[MMAP_SZ / sizeof(*res) - 1] ||
+ res[MMAP_SZ / sizeof(*res) - 2] ||
+ memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+ res + MMAP_SZ / sizeof(*res) - 4, 8) )
+ goto fail;
+
+ asm volatile ( "vmovddup %xmm2, %xmm2\n\t"
+ "vmovsd %xmm1, %xmm2, %xmm2" );
+ memset(res, 0xdb, 32);
+ set_insn(vmaskmovpd);
+ regs.edx = (unsigned long)(res - 2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+ res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+ goto fail;
+
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing stmxcsr (%edx)...");
if ( cpu_has_sse )
{
*/
simd_scalar_fp,
+ /*
+ * 128 bits of integer or floating point data, with no further
+ * formatting information.
+ */
+ simd_128,
+
/* Operand size encoded in non-standard way. */
simd_other
};
uint8_t vsib:1;
} ext0f38_table[256] = {
[0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+ [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10] = { .simd_size = simd_packed_int },
[0x14 ... 0x15] = { .simd_size = simd_packed_fp },
[0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+ [0x1a] = { .simd_size = simd_128, .two_op = 1 },
[0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
[0x28 ... 0x29] = { .simd_size = simd_packed_int },
[0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x2b] = { .simd_size = simd_packed_int },
+ [0x2c ... 0x2d] = { .simd_size = simd_other },
+ [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
[0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
[0x37 ... 0x3f] = { .simd_size = simd_packed_int },
[0x40] = { .simd_size = simd_packed_int },
uint8_t two_op:1;
uint8_t four_op:1;
} ext0f3a_table[256] = {
+ [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+ [0x06] = { .simd_size = simd_packed_fp },
[0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
[0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
[0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
[0x0e ... 0x0f] = { .simd_size = simd_packed_int },
[0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
+ [0x18] = { .simd_size = simd_128 },
+ [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
[0x20] = { .simd_size = simd_none },
[0x21] = { .simd_size = simd_other },
[0x22] = { .simd_size = simd_none },
buf_ + 3; \
})
+#define copy_VEX(ptr, vex) ({ \
+ if ( !mode_64bit() ) \
+ (vex).reg |= 8; \
+ (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+ (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+ container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
+})
+
#define copy_REX_VEX(ptr, rex, vex) do { \
if ( (vex).opcx != vex_none ) \
- { \
- if ( !mode_64bit() ) \
- vex.reg |= 8; \
- (ptr)[0 - PFX_BYTES] = 0xc4; \
- (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
- (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
- } \
+ copy_VEX(ptr, vex); \
else \
{ \
if ( (vex).pfx ) \
op_bytes = 4 << (ctxt->opcode & 1);
break;
+ case simd_128:
+ op_bytes = 16;
+ break;
+
default:
op_bytes = 0;
break;
struct x86_emulate_state state;
int rc;
uint8_t b, d, *opc = NULL;
+ unsigned int first_byte = 0;
bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
!is_branch_step(ctxt, ops);
bool sfence = false;
fic.insn_bytes = PFX_BYTES + 3;
break;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
+ generate_exception_if(!vex.l, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_avx;
+
case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
host_and_vcpu_must_have(sse4_1);
goto simd_0f38_common;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x0e): /* vtestps {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x0f): /* vtestpd {x,y}mm/mem,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ /* fall through */
case X86EMUL_OPC_66(0x0f38, 0x17): /* ptest xmm/m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
if ( vex.opcx == vex_none )
}
goto movdqa;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd {x,y}mm,{x,y}mm,mem */
+ {
+ typeof(vex) *pvex;
+
+ generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+
+ /*
+ * While we can't reasonably provide fully correct behavior here
+ * (in particular, for writes, avoiding the memory read in anticipation
+ * of all elements in the range eventually being written), we can (and
+ * should) still limit the memory access to the smallest possible range
+ * (suppressing it altogether if all mask bits are clear), to provide
+ * correct faulting behavior. Read the mask bits via vmovmskp{s,d}
+ * for that purpose.
+ */
+ opc = init_prefixes(stub);
+ pvex = copy_VEX(opc, vex);
+ pvex->opcx = vex_0f;
+ if ( !(b & 1) )
+ pvex->pfx = vex_none;
+ opc[0] = 0x50; /* vmovmskp{s,d} */
+ /* Use %rax as GPR destination and VEX.vvvv as source. */
+ pvex->r = 1;
+ pvex->b = !mode_64bit() || (vex.reg >> 3);
+ opc[1] = 0xc0 | (~vex.reg & 7);
+ pvex->reg = 0xf;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+ put_stub(stub);
+
+ if ( !ea.val )
+ goto complete_insn;
+
+ op_bytes = 4 << (b & 1);
+ first_byte = __builtin_ctz(ea.val);
+ ea.val >>= first_byte;
+ first_byte *= op_bytes;
+ op_bytes *= 32 - __builtin_clz(ea.val);
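+        /*
+         * Worked example: a mask of 0b0110 with 4-byte elements yields
+         * first_byte = 4 and op_bytes = 8, i.e. only bytes 4...11 of the
+         * memory operand get accessed.
+         */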
+
+ /*
+ * Even for the memory write variant a memory read is needed, unless
+ * all set mask bits are contiguous.
+ */
+ if ( ea.val & (ea.val + 1) )
+ d = (d & ~SrcMask) | SrcMem;
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] = modrm & 0x38;
+ fic.insn_bytes = PFX_BYTES + 2;
+
+ break;
+ }
+
case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
host_and_vcpu_must_have(sse4_2);
goto simd_0f38_common;
: "0" ((uint32_t)src.val), "rm" (_regs.edx) );
break;
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
+ generate_exception_if(!vex.l, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x04): /* vpermilps $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,{x,y}mm/mem,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_imm8_avx;
+
case X86EMUL_OPC(0x0f3a, 0x0f): /* palignr $imm8,mm/m64,mm */
case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
host_and_vcpu_must_have(ssse3);
switch ( d & SrcMask )
{
case SrcMem:
- rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+ rc = ops->read(ea.mem.seg, ea.mem.off + first_byte,
+ (void *)mmvalp + first_byte, op_bytes,
+ ctxt);
if ( rc != X86EMUL_OKAY )
goto done;
/* fall through */
if ( (d & DstMask) == DstMem )
{
fail_if(!ops->write); /* Check before running the stub. */
- ASSERT(d & Mov);
+ if ( (d & SrcMask) == SrcMem )
+ d |= Mov; /* Force memory write to occur below. */
+
+ switch ( ctxt->opcode )
+ {
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+ /* These have merge semantics; force write to occur. */
+ d |= Mov;
+ break;
+ default:
+ ASSERT(d & Mov);
+ break;
+ }
+
dst.type = OP_MEM;
dst.bytes = op_bytes;
dst.mem = ea.mem;
else
{
fail_if(!ops->write);
- rc = ops->write(dst.mem.seg, dst.mem.off,
- !state->simd_size ? &dst.val : (void *)mmvalp,
+ rc = ops->write(dst.mem.seg, dst.mem.off + first_byte,
+ !state->simd_size ? &dst.val
+ : (void *)mmvalp + first_byte,
dst.bytes, ctxt);
if ( sfence )
asm volatile ( "sfence" ::: "memory" );