run: $(TARGET)
./$(TARGET)
-SIMD := sse sse2 sse4 avx
+SIMD := sse sse2 sse4 avx avx2
FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
+TESTCASES := blowfish $(SIMD) $(FMA)
blowfish-cflags := ""
blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
fma-vecs := $(avx-vecs)
fma-ints :=
fma-flts := $(avx-flts)
-
-# When converting SSE to AVX, have the compiler avoid XMM0 to widen
-# coverage of the VEX.vvvv checks in the emulator. We must not do this,
-# however, for SSE4.1 and later, as there are instructions with XMM0 as
-# an implicit operand.
-sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse4 := -Wa,-msse2avx
+avx2-vecs := $(avx-vecs)
+avx2-ints := 1 2 4 8
+avx2-flts := 4 8
# For AVX and later, have the compiler avoid XMM0 to widen coverage of
# the VEX.vvvv checks in the emulator.
"-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
$(foreach flt,$($(1)-flts), \
"-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
-$(1)-avx-cflags := \
- $(foreach vec,$($(1)-vecs), \
- $(foreach int,$($(1)-ints), \
- "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
- "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
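+# Illustrative: with the template above, "$(call simd-defs,avx2)" yields
+# avx2-cflags entries such as
+#   "-D_32f8 -mavx2 -ffixed-xmm0 -Os -DVEC_SIZE=32 -DFLOAT_SIZE=8"
+# (assuming "$(call non-sse,avx2)" expands to -ffixed-xmm0, per the comment
+# above), i.e. one compilation per flavor/element-size combination.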
)
mv $@.new $@
-$(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))):
+$(addsuffix .c,$(SIMD)):
ln -sf simd.c $@
$(addsuffix .c,$(FMA)):
ln -sf simd-fma.c $@
-$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h
+$(addsuffix .o,$(SIMD) $(FMA)): simd.h
$(TARGET): x86-emulate.o test_x86_emulator.o
$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
# endif
# endif
#elif VEC_SIZE == 32
-# if defined(__AVX__) && ELEM_SIZE == 4
+# if defined(__AVX2__)
+# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vdi_t){} == 0)
+# elif defined(__AVX__) && ELEM_SIZE == 4
# define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
# elif defined(__AVX__) && ELEM_SIZE == 8
# define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
__builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
-# define swap2(x) ({ \
- vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
- __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+# ifdef __AVX2__
+# define swap2(x) __builtin_ia32_permvarsf256(x, __builtin_ia32_cvtps2dq256(inv) - 1)
+# else
+# define swap2(x) ({ \
+ vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+ __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
+# endif
# elif VEC_SIZE == 16
# if defined(__AVX2__)
# define broadcast(x) __builtin_ia32_vbroadcastss_ps((vec_t){ x })
vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
__builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
})
+# ifdef __AVX2__
+# define swap2(x) __builtin_ia32_permdf256(x, 0b00011011)
+# endif
# elif VEC_SIZE == 16
# define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
# define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
__builtin_ia32_maskmovdqu((vqi_t)(x), m_, d_); \
__builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
})
+#elif VEC_SIZE == 32 && defined(__AVX2__)
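+/*
+ * The 256-bit integer horizontal ops used below work within 128-bit lanes.
+ * swap_lanes() recombines its inputs (t1_ = lo(x):lo(y), t2_ = hi(x):hi(y))
+ * so that func(t1_, t2_) yields results in the conventional
+ * "all of x, then all of y" element order.
+ */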
+# define swap_lanes(x, y, func, type) ({ \
+ long long __attribute__((vector_size(16))) t_ = __builtin_ia32_extract128i256((vdi_t)(y), 0); \
+ type t1_ = (type)__builtin_ia32_insert128i256((vdi_t)(x), t_, 1), t2_; \
+ t_ = __builtin_ia32_extract128i256((vdi_t)(x), 1); \
+ t2_ = (type)__builtin_ia32_insert128i256((vdi_t)(y), t_, 0); \
+ func(t1_, t2_); \
+})
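+/*
+ * rotr() below: vperm2i128 swaps the two 128-bit lanes of x; the per-lane
+ * vpalignr over that and the original x then amounts to a full 256-bit
+ * right rotation by (n) elements.
+ */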
+# if INT_SIZE == 1 || UINT_SIZE == 1
+# define broadcast(x) ({ char s_ = (x); vec_t d_; asm ( "vpbroadcastb %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignb256((vqi_t)(x), (vqi_t)(y)))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 8))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+# define broadcast(x) ({ short s_ = (x); vec_t d_; asm ( "vpbroadcastw %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignw256((vhi_t)(x), (vhi_t)(y)))
+# define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddw256, vhi_t))
+# define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubw256, vhi_t))
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw256((vhi_t)(x), (vhi_t)(y), 0b10101010))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 16))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define broadcast(x) ({ int s_ = (x); vec_t d_; asm ( "vpbroadcastd %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignd256((vsi_t)(x), (vsi_t)(y)))
+# define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddd256, vsi_t))
+# define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubd256, vsi_t))
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b10101010))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 32))
+# define select(d, x, y, m) ({ \
+ vsi_t m_ = (vsi_t)(m); \
+ *(d) = (vec_t)__builtin_ia32_maskloadd256((vsi_t *)&(x), m_); \
+ __builtin_ia32_maskstored256((vsi_t *)(d), ~m_, (vsi_t)(y)); \
+})
+# define swap(x) ((vec_t)__builtin_ia32_permvarsi256((vsi_t)(x), (vsi_t)inv - 1))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b11001100))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 64))
+# define select(d, x, y, m) ({ \
+ vdi_t m_ = (vdi_t)(m); \
+ *(d) = (vec_t)__builtin_ia32_maskloadq256((vdi_t *)&(x), m_); \
+ __builtin_ia32_maskstoreq256((vdi_t *)(d), ~m_, (vdi_t)(y)); \
+})
+# define swap(x) ((vec_t)__builtin_ia32_permdi256((vdi_t)(x), 0b00011011))
+# define swap2(x) ({ \
+ vdi_t t_ = __builtin_ia32_permdi256((vdi_t)(x), 0b10110001); \
+ (vec_t)__builtin_ia32_permti256(t_, t_, 0b00000001); \
+})
+# endif
+# if INT_SIZE == 1
+# define abs(x) ((vec_t)__builtin_ia32_pabsb256((vqi_t)(x)))
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxsb256((vqi_t)(x), (vqi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminsb256((vqi_t)(x), (vqi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw256((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd256((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq256((vqi_t)(x)))
+# elif UINT_SIZE == 1
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxub256((vqi_t)(x), (vqi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminub256((vqi_t)(x), (vqi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw256((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd256((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq256((vqi_t)(x)))
+# elif INT_SIZE == 2
+# define abs(x) __builtin_ia32_pabsw256(x)
+# define max(x, y) __builtin_ia32_pmaxsw256(x, y)
+# define min(x, y) __builtin_ia32_pminsw256(x, y)
+# define mul_hi(x, y) __builtin_ia32_pmulhw256(x, y)
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd256(x))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq256(x))
+# elif UINT_SIZE == 2
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxuw256((vhi_t)(x), (vhi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminuw256((vhi_t)(x), (vhi_t)(y)))
+# define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw256((vhi_t)(x), (vhi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd256((vhi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq256((vhi_t)(x)))
+# elif INT_SIZE == 4
+# define abs(x) __builtin_ia32_pabsd256(x)
+# define max(x, y) __builtin_ia32_pmaxsd256(x, y)
+# define min(x, y) __builtin_ia32_pminsd256(x, y)
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq256(x))
+# elif UINT_SIZE == 4
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxud256((vsi_t)(x), (vsi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminud256((vsi_t)(x), (vsi_t)(y)))
+# define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq256((vsi_t)(x), (vsi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq256((vsi_t)(x)))
+# elif INT_SIZE == 8
+# define broadcast(x) ({ \
+ long long s_ = (x); \
+ long long __attribute__((vector_size(16))) t_; \
+ vec_t d_; \
+ asm ( "vpbroadcastq %1,%0" : "=x" (t_) : "m" (s_)); \
+ asm ( "vbroadcasti128 %1,%0" : "=x" (d_) : "m" (t_)); \
+ d_; \
+})
+# elif UINT_SIZE == 8
+# define broadcast(x) ({ long long s_ = (x); vec_t d_; asm ( "vpbroadcastq %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# endif
#endif
#if VEC_SIZE == 16 && defined(__SSE3__)
# if FLOAT_SIZE == 4
# define addsub(x, y) __builtin_ia32_addsubps256(x, y)
# define dup_hi(x) __builtin_ia32_movshdup256(x)
# define dup_lo(x) __builtin_ia32_movsldup256(x)
-# define hadd(x, y) ({ \
+# ifdef __AVX2__
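+/*
+ * vhaddps/vhsubps work within 128-bit lanes; with AVX2, a single vpermps
+ * puts their results into the same element order as the open-coded
+ * variants below.
+ */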
+# define hadd(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_haddps256(x, y), \
+ (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+# define hsub(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_hsubps256(x, y), \
+ (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+# else
+# define hadd(x, y) ({ \
vec_t t_ = __builtin_ia32_haddps256(x, y); \
(vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
-# define hsub(x, y) ({ \
+# define hsub(x, y) ({ \
vec_t t_ = __builtin_ia32_hsubps256(x, y); \
(vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
+# endif
# elif FLOAT_SIZE == 8
# define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
# define dup_lo(x) __builtin_ia32_movddup256(x)
-# define hadd(x, y) ({ \
+# ifdef __AVX2__
+# define hadd(x, y) __builtin_ia32_permdf256(__builtin_ia32_haddpd256(x, y), 0b11011000)
+# define hsub(x, y) __builtin_ia32_permdf256(__builtin_ia32_hsubpd256(x, y), 0b11011000)
+# else
+# define hadd(x, y) ({ \
vec_t t_ = __builtin_ia32_haddpd256(x, y); \
(vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
-# define hsub(x, y) ({ \
+# define hsub(x, y) ({ \
vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
(vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
+# endif
# endif
#endif
#if VEC_SIZE == 16 && defined(__SSSE3__)
z *= alt;
# endif
/*
- * Zap elements for which the shift count is negative (and the hence the
+ * Zap elements for which the shift count is zero (and hence the
* decrement below would yield a negative count.
*/
z &= (sh > 0);
--sh;
touch(sh);
y = z << sh;
- touch(sh);
if ( !to_bool(x == y + y) ) return __LINE__;
+# if defined(__AVX2__) && ELEM_SIZE >= 4
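+    /*
+     * With AVX2 (and element sizes of at least 4 bytes) the compiler can
+     * vectorize per-element variable shifts; check that shifting y back
+     * right by the same counts recovers z.
+     */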
+ touch(sh);
+ x = y >> sh;
+ if ( !to_bool(x == z) ) return __LINE__;
+# endif
+
# endif
#endif
#include "sse.h"
#include "sse2.h"
#include "sse4.h"
-#include "sse2-avx.h"
-#include "sse4-avx.h"
#include "avx.h"
#include "fma4.h"
#include "fma.h"
+#include "avx2.h"
#define verbose false /* Switch to true for far more logging. */
{
return cpu_has_avx;
}
-#define simd_check_sse2_avx simd_check_avx
-#define simd_check_sse4_avx simd_check_avx
static bool simd_check_fma4(void)
{
return cpu_has_fma4;
}
+static bool simd_check_avx2(void)
+{
+ return cpu_has_avx2;
+}
+
static void simd_set_regs(struct cpu_user_regs *regs)
{
if ( cpu_has_mmx )
SIMD(SSE4 packed u32, sse4, 16u4),
SIMD(SSE4 packed s64, sse4, 16i8),
SIMD(SSE4 packed u64, sse4, 16u8),
- SIMD(SSE2/AVX packed s8, sse2_avx, 16i1),
- SIMD(SSE2/AVX packed u8, sse2_avx, 16u1),
- SIMD(SSE2/AVX packed s16, sse2_avx, 16i2),
- SIMD(SSE2/AVX packed u16, sse2_avx, 16u2),
- SIMD(SSE2/AVX packed s32, sse2_avx, 16i4),
- SIMD(SSE2/AVX packed u32, sse2_avx, 16u4),
- SIMD(SSE2/AVX packed s64, sse2_avx, 16i8),
- SIMD(SSE2/AVX packed u64, sse2_avx, 16u8),
- SIMD(SSE4/AVX packed s8, sse4_avx, 16i1),
- SIMD(SSE4/AVX packed u8, sse4_avx, 16u1),
- SIMD(SSE4/AVX packed s16, sse4_avx, 16i2),
- SIMD(SSE4/AVX packed u16, sse4_avx, 16u2),
- SIMD(SSE4/AVX packed s32, sse4_avx, 16i4),
- SIMD(SSE4/AVX packed u32, sse4_avx, 16u4),
- SIMD(SSE4/AVX packed s64, sse4_avx, 16i8),
- SIMD(SSE4/AVX packed u64, sse4_avx, 16u8),
SIMD(AVX scalar single, avx, f4),
SIMD(AVX 128bit single, avx, 16f4),
SIMD(AVX 256bit single, avx, 32f4),
SIMD(FMA scalar double, fma, f8),
SIMD(FMA 128bit double, fma, 16f8),
SIMD(FMA 256bit double, fma, 32f8),
+ SIMD(AVX2 128bit single, avx2, 16f4),
+ SIMD(AVX2 256bit single, avx2, 32f4),
+ SIMD(AVX2 128bit double, avx2, 16f8),
+ SIMD(AVX2 256bit double, avx2, 32f8),
+ SIMD(AVX2 s8x16, avx2, 16i1),
+ SIMD(AVX2 u8x16, avx2, 16u1),
+ SIMD(AVX2 s16x8, avx2, 16i2),
+ SIMD(AVX2 u16x8, avx2, 16u2),
+ SIMD(AVX2 s32x4, avx2, 16i4),
+ SIMD(AVX2 u32x4, avx2, 16u4),
+ SIMD(AVX2 s64x2, avx2, 16i8),
+ SIMD(AVX2 u64x2, avx2, 16u8),
+ SIMD(AVX2 s8x32, avx2, 32i1),
+ SIMD(AVX2 u8x32, avx2, 32u1),
+ SIMD(AVX2 s16x16, avx2, 32i2),
+ SIMD(AVX2 u16x16, avx2, 32u2),
+ SIMD(AVX2 s32x8, avx2, 32i4),
+ SIMD(AVX2 u32x8, avx2, 32u4),
+ SIMD(AVX2 s64x4, avx2, 32i8),
+ SIMD(AVX2 u64x4, avx2, 32u8),
#undef SIMD_
#undef SIMD
};
else
printf("skipped\n");
+ printf("%-40s", "Testing vpmaskmovd %xmm1,%xmm2,(%edx)...");
+ if ( stack_exec && cpu_has_avx2 )
+ {
+ decl_insn(vpmaskmovd);
+
+ asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+ "vpinsrd $0b00, %1, %%xmm1, %%xmm2\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+ put_insn(vpmaskmovd, "vpmaskmovd %%xmm1, %%xmm2, (%0)")
+#else
+ put_insn(vpmaskmovd,
+ ".byte 0xc4, 0xe2, 0x69, 0x8e, 0x0a")
+#endif
+ :: "d" (NULL), "r" (~0) );
+
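+    /*
+     * Only element 0's mask is set; its slot is the final dword of the
+     * mapping, so the emulated store may touch just those 4 bytes and the
+     * 0xdb filler below them must survive.
+     */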
+ memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+ set_insn(vpmaskmovd);
+ regs.edx = (unsigned long)res + MMAP_SZ - 4;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+ res[MMAP_SZ / sizeof(*res) - 1] ||
+ memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+ res + MMAP_SZ / sizeof(*res) - 4, 12) )
+ goto fail;
+
+ asm volatile ( "vpinsrd $0b11, %0, %%xmm1, %%xmm2" :: "r" (~0) );
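+    /*
+     * Now only element 3's mask is set; its slot is res[0], while
+     * res[1..3] must keep their 0xdb filler.
+     */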
+ memset(res, 0xdb, 32);
+ set_insn(vpmaskmovd);
+ regs.edx = (unsigned long)(res - 3);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+ res[0] || memcmp(res + 1, res + 4, 12) )
+ goto fail;
+
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vpmaskmovq %xmm1,%xmm2,(%edx)...");
+ if ( stack_exec && cpu_has_avx2 )
+ {
+ decl_insn(vpmaskmovq);
+
+ asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+ "vpcmpeqd %%xmm0, %%xmm0, %%xmm0\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+ "vpblendd $0b0011, %%xmm0, %%xmm1, %%xmm2\n\t"
+ put_insn(vpmaskmovq, "vpmaskmovq %%xmm1, %%xmm2, (%0)")
+#else
+ ".byte 0xc4, 0xe3, 0x71, 0x02, 0xd0, 0b0011\n\t"
+ put_insn(vpmaskmovq,
+ ".byte 0xc4, 0xe2, 0xe9, 0x8e, 0x0a")
+#endif
+ :: "d" (NULL) );
+
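+    /*
+     * vpblendd copies xmm0's all-ones low two dwords into xmm2, i.e. only
+     * the low qword's mask bit is set; its slot is the final qword of the
+     * mapping.
+     */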
+ memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+ set_insn(vpmaskmovq);
+ regs.edx = (unsigned long)res + MMAP_SZ - 8;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+ res[MMAP_SZ / sizeof(*res) - 1] ||
+ res[MMAP_SZ / sizeof(*res) - 2] ||
+ memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+ res + MMAP_SZ / sizeof(*res) - 4, 8) )
+ goto fail;
+
+#if 0 /* Don't use AVX2 instructions for now */
+ asm volatile ( "vpermq $0b00000001, %ymm2, %ymm2" );
+#else
+ asm volatile ( ".byte 0xc4, 0xe3, 0xfd, 0x00, 0xd2, 0b00000001" );
+#endif
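+    /*
+     * vpermq moves the set mask bit to qword 1, whose slot is res[0..1];
+     * res[2..3] must keep their 0xdb filler.
+     */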
+ memset(res, 0xdb, 32);
+ set_insn(vpmaskmovq);
+ regs.edx = (unsigned long)(res - 2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+ res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+ goto fail;
+
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing stmxcsr (%edx)...");
if ( cpu_has_sse )
{
[0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10] = { .simd_size = simd_packed_int },
[0x13] = { .simd_size = simd_other, .two_op = 1 },
- [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
+ [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
[0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
[0x1a] = { .simd_size = simd_128, .two_op = 1 },
[0x2c ... 0x2d] = { .simd_size = simd_other },
[0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
[0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
- [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
+ [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
[0x40] = { .simd_size = simd_packed_int },
[0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x45 ... 0x47] = { .simd_size = simd_packed_int },
+ [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
+ [0x5a] = { .simd_size = simd_128, .two_op = 1 },
+ [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+ [0x8c] = { .simd_size = simd_other },
+ [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
[0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
[0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
[0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
uint8_t two_op:1;
uint8_t four_op:1;
} ext0f3a_table[256] = {
+ [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
+ [0x02] = { .simd_size = simd_packed_int },
[0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
[0x06] = { .simd_size = simd_packed_fp },
[0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
[0x20] = { .simd_size = simd_none },
[0x21] = { .simd_size = simd_other },
[0x22] = { .simd_size = simd_none },
+ [0x38] = { .simd_size = simd_128 },
+ [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
[0x42] = { .simd_size = simd_packed_int },
[0x44] = { .simd_size = simd_packed_int },
+ [0x46] = { .simd_size = simd_packed_int },
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( !vex.l )
goto simd_0f_avx;
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x45): /* vpsrlv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x47): /* vpsllv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_avx2:
host_and_vcpu_must_have(avx2);
goto simd_0f_ymm;
}
case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( vex.l )
+ {
+ simd_0f_imm8_avx2:
host_and_vcpu_must_have(avx2);
+ }
else
{
case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
op_bytes = 8 << vex.l;
goto simd_0f_ymm;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
+ generate_exception_if(!vex.l || vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_avx;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
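+        /* Element size from the opcode: 0x58/0x59 -> 4/8 bytes, 0x78/0x79 -> 1/2. */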
+ op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */
+ generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
+ {
+ typeof(vex) *pvex;
+ unsigned int mask = vex.w ? 0x80808080U : 0x88888888U;
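+        /*
+         * vpmovmskb (see below) yields one bit per byte; these masks retain
+         * only the bit from the top byte of each dword (W clear) or qword
+         * (W set) element.
+         */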
+
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+
+ /*
+ * While we can't reasonably provide fully correct behavior here
+ * (in particular, for writes, avoiding the memory read in anticipation
+ * of all elements in the range eventually being written), we can (and
+ * should) still limit the memory access to the smallest possible range
+ * (suppressing it altogether if all mask bits are clear), to provide
+ * correct faulting behavior. Read the mask bits via vpmovmskb
+ * for that purpose.
+ */
+ opc = init_prefixes(stub);
+ pvex = copy_VEX(opc, vex);
+ pvex->opcx = vex_0f;
+ opc[0] = 0xd7; /* vpmovmskb */
+ /* Use %rax as GPR destination and VEX.vvvv as source. */
+ pvex->r = 1;
+ pvex->b = !mode_64bit() || (vex.reg >> 3);
+ opc[1] = 0xc0 | (~vex.reg & 7);
+ pvex->reg = 0xf;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+ put_stub(stub);
+
+ /* Convert byte granular result to dword/qword granularity. */
+ ea.val &= mask;
+ if ( !ea.val )
+ goto complete_insn;
+
+ first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1);
+ ea.val >>= first_byte;
+ op_bytes = 32 - __builtin_clz(ea.val);
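+        /*
+         * Worked example (W clear, i.e. mask 0x88888888): if just mask
+         * elements 1 and 2 have their top bits set, vpmovmskb leaves bits 7
+         * and 11 in ea.val (0x880), so first_byte is 4 and op_bytes 8, i.e.
+         * only bytes 4...11 of the operand get accessed.
+         */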
+
+ /*
+ * Even for the memory write variant a memory read is needed, unless
+ * all set mask bits are contiguous.
+ */
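+        /*
+         * E.g. (W clear) ea.val 0x808, i.e. elements 0 and 2 set: the carry
+         * from adding ~mask + 1 stops at the unset element 1, bit 11 stays
+         * set in the sum as well as in ea.val, and the read gets forced.
+         */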
+ if ( ea.val & (ea.val + ~mask + 1) )
+ d = (d & ~SrcMask) | SrcMem;
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] = modrm & 0x38;
+ fic.insn_bytes = PFX_BYTES + 2;
+
+ break;
+ }
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
: "0" ((uint32_t)src.val), "rm" (_regs.edx) );
break;
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x01): /* vpermpd $imm8,ymm/m256,ymm */
+ generate_exception_if(!vex.l || !vex.w, EXC_UD);
+ goto simd_0f_imm8_avx2;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */
+ generate_exception_if(!vex.l, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x02): /* vpblendd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_imm8_avx2;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
{
case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} */
/* These have merge semantics; force write to occur. */
d |= Mov;
break;