run: $(TARGET)
./$(TARGET)
-SIMD := sse sse2 sse4 avx
+SIMD := sse sse2 sse4 avx avx2
FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
+TESTCASES := blowfish $(SIMD) $(FMA)
blowfish-cflags := ""
blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
fma-vecs := $(avx-vecs)
fma-ints :=
fma-flts := $(avx-flts)
-
-# When converting SSE to AVX, have the compiler avoid XMM0 to widen
-# coverage of the VEX.vvvv checks in the emulator. We must not do this,
-# however, for SSE4.1 and later, as there are instructions with XMM0 as
-# an implicit operand.
-sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse4 := -Wa,-msse2avx
+avx2-vecs := $(avx-vecs)
+avx2-ints := 1 2 4 8
+avx2-flts := 4 8
# For AVX and later, have the compiler avoid XMM0 to widen coverage of
# the VEX.vvvv checks in the emulator.
"-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
$(foreach flt,$($(1)-flts), \
"-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
-$(1)-avx-cflags := \
- $(foreach vec,$($(1)-vecs), \
- $(foreach int,$($(1)-ints), \
- "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
- "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
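+# Illustrative: with the template above, "$(call simd-defs,avx2)" yields
+# avx2-cflags entries such as
+#   "-D_32f8 -mavx2 -ffixed-xmm0 -Os -DVEC_SIZE=32 -DFLOAT_SIZE=8"
+# (assuming "$(call non-sse,avx2)" expands to -ffixed-xmm0, per the comment
+# above), i.e. one compilation per flavor/element-size combination.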
)
mv $@.new $@
-$(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))):
+$(addsuffix .c,$(SIMD)):
ln -sf simd.c $@
$(addsuffix .c,$(FMA)):
ln -sf simd-fma.c $@
-$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h
+$(addsuffix .o,$(SIMD) $(FMA)): simd.h
$(TARGET): x86-emulate.o test_x86_emulator.o
$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
# endif
# endif
#elif VEC_SIZE == 32
-# if defined(__AVX__) && ELEM_SIZE == 4
+# if defined(__AVX2__)
+# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vdi_t){} == 0)
+# elif defined(__AVX__) && ELEM_SIZE == 4
# define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
# elif defined(__AVX__) && ELEM_SIZE == 8
# define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
__builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
-# define swap2(x) ({ \
- vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
- __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+# ifdef __AVX2__
+# define swap2(x) __builtin_ia32_permvarsf256(x, __builtin_ia32_cvtps2dq256(inv) - 1)
+# else
+# define swap2(x) ({ \
+ vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+ __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
+# endif
# elif VEC_SIZE == 16
# if defined(__AVX2__)
# define broadcast(x) __builtin_ia32_vbroadcastss_ps((vec_t){ x })
vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
__builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
})
+# ifdef __AVX2__
+# define swap2(x) __builtin_ia32_permdf256(x, 0b00011011)
+# endif
# elif VEC_SIZE == 16
# define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
# define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
__builtin_ia32_maskmovdqu((vqi_t)(x), m_, d_); \
__builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
})
+#elif VEC_SIZE == 32 && defined(__AVX2__)
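+/*
+ * The 256-bit integer horizontal ops used below work within 128-bit lanes.
+ * swap_lanes() recombines its inputs (t1_ = lo(x):lo(y), t2_ = hi(x):hi(y))
+ * so that func(t1_, t2_) yields results in the conventional
+ * "all of x, then all of y" element order.
+ */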
+# define swap_lanes(x, y, func, type) ({ \
+ long long __attribute__((vector_size(16))) t_ = __builtin_ia32_extract128i256((vdi_t)(y), 0); \
+ type t1_ = (type)__builtin_ia32_insert128i256((vdi_t)(x), t_, 1), t2_; \
+ t_ = __builtin_ia32_extract128i256((vdi_t)(x), 1); \
+ t2_ = (type)__builtin_ia32_insert128i256((vdi_t)(y), t_, 0); \
+ func(t1_, t2_); \
+})
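+/*
+ * rotr() below: vperm2i128 swaps the two 128-bit lanes of x; the per-lane
+ * vpalignr over that and the original x then amounts to a full 256-bit
+ * right rotation by (n) elements.
+ */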
+# if INT_SIZE == 1 || UINT_SIZE == 1
+# define broadcast(x) ({ char s_ = (x); vec_t d_; asm ( "vpbroadcastb %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignb256((vqi_t)(x), (vqi_t)(y)))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 8))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+# define broadcast(x) ({ short s_ = (x); vec_t d_; asm ( "vpbroadcastw %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignw256((vhi_t)(x), (vhi_t)(y)))
+# define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddw256, vhi_t))
+# define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubw256, vhi_t))
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw256((vhi_t)(x), (vhi_t)(y), 0b10101010))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 16))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define broadcast(x) ({ int s_ = (x); vec_t d_; asm ( "vpbroadcastd %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignd256((vsi_t)(x), (vsi_t)(y)))
+# define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddd256, vsi_t))
+# define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubd256, vsi_t))
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b10101010))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 32))
+# define select(d, x, y, m) ({ \
+ vsi_t m_ = (vsi_t)(m); \
+ *(d) = (vec_t)__builtin_ia32_maskloadd256((vsi_t *)&(x), m_); \
+ __builtin_ia32_maskstored256((vsi_t *)(d), ~m_, (vsi_t)(y)); \
+})
+# define swap(x) ((vec_t)__builtin_ia32_permvarsi256((vsi_t)(x), (vsi_t)inv - 1))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b11001100))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+ (vdi_t)(x), (n) * 64))
+# define select(d, x, y, m) ({ \
+ vdi_t m_ = (vdi_t)(m); \
+ *(d) = (vec_t)__builtin_ia32_maskloadq256((vdi_t *)&(x), m_); \
+ __builtin_ia32_maskstoreq256((vdi_t *)(d), ~m_, (vdi_t)(y)); \
+})
+# define swap(x) ((vec_t)__builtin_ia32_permdi256((vdi_t)(x), 0b00011011))
+# define swap2(x) ({ \
+ vdi_t t_ = __builtin_ia32_permdi256((vdi_t)(x), 0b10110001); \
+ (vec_t)__builtin_ia32_permti256(t_, t_, 0b00000001); \
+})
+# endif
+# if INT_SIZE == 1
+# define abs(x) ((vec_t)__builtin_ia32_pabsb256((vqi_t)(x)))
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxsb256((vqi_t)(x), (vqi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminsb256((vqi_t)(x), (vqi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw256((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd256((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq256((vqi_t)(x)))
+# elif UINT_SIZE == 1
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxub256((vqi_t)(x), (vqi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminub256((vqi_t)(x), (vqi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw256((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd256((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq256((vqi_t)(x)))
+# elif INT_SIZE == 2
+# define abs(x) __builtin_ia32_pabsw256(x)
+# define max(x, y) __builtin_ia32_pmaxsw256(x, y)
+# define min(x, y) __builtin_ia32_pminsw256(x, y)
+# define mul_hi(x, y) __builtin_ia32_pmulhw256(x, y)
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd256(x))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq256(x))
+# elif UINT_SIZE == 2
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxuw256((vhi_t)(x), (vhi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminuw256((vhi_t)(x), (vhi_t)(y)))
+# define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw256((vhi_t)(x), (vhi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd256((vhi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq256((vhi_t)(x)))
+# elif INT_SIZE == 4
+# define abs(x) __builtin_ia32_pabsd256(x)
+# define max(x, y) __builtin_ia32_pmaxsd256(x, y)
+# define min(x, y) __builtin_ia32_pminsd256(x, y)
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq256(x))
+# elif UINT_SIZE == 4
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxud256((vsi_t)(x), (vsi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminud256((vsi_t)(x), (vsi_t)(y)))
+# define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq256((vsi_t)(x), (vsi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq256((vsi_t)(x)))
+# elif INT_SIZE == 8
+# define broadcast(x) ({ \
+ long long s_ = (x); \
+ long long __attribute__((vector_size(16))) t_; \
+ vec_t d_; \
+ asm ( "vpbroadcastq %1,%0" : "=x" (t_) : "m" (s_)); \
+ asm ( "vbroadcasti128 %1,%0" : "=x" (d_) : "m" (t_)); \
+ d_; \
+})
+# elif UINT_SIZE == 8
+# define broadcast(x) ({ long long s_ = (x); vec_t d_; asm ( "vpbroadcastq %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# endif
#endif
#if VEC_SIZE == 16 && defined(__SSE3__)
# if FLOAT_SIZE == 4
# define addsub(x, y) __builtin_ia32_addsubps256(x, y)
# define dup_hi(x) __builtin_ia32_movshdup256(x)
# define dup_lo(x) __builtin_ia32_movsldup256(x)
-# define hadd(x, y) ({ \
+# ifdef __AVX2__
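+/*
+ * vhaddps/vhsubps work within 128-bit lanes; with AVX2, a single vpermps
+ * puts their results into the same element order as the open-coded
+ * variants below.
+ */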
+# define hadd(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_haddps256(x, y), \
+ (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+# define hsub(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_hsubps256(x, y), \
+ (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+# else
+# define hadd(x, y) ({ \
vec_t t_ = __builtin_ia32_haddps256(x, y); \
(vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
-# define hsub(x, y) ({ \
+# define hsub(x, y) ({ \
vec_t t_ = __builtin_ia32_hsubps256(x, y); \
(vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
+# endif
# elif FLOAT_SIZE == 8
# define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
# define dup_lo(x) __builtin_ia32_movddup256(x)
-# define hadd(x, y) ({ \
+# ifdef __AVX2__
+# define hadd(x, y) __builtin_ia32_permdf256(__builtin_ia32_haddpd256(x, y), 0b11011000)
+# define hsub(x, y) __builtin_ia32_permdf256(__builtin_ia32_hsubpd256(x, y), 0b11011000)
+# else
+# define hadd(x, y) ({ \
vec_t t_ = __builtin_ia32_haddpd256(x, y); \
(vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
-# define hsub(x, y) ({ \
+# define hsub(x, y) ({ \
vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
(vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
+# endif
# endif
#endif
#if VEC_SIZE == 16 && defined(__SSSE3__)
z *= alt;
# endif
/*
- * Zap elements for which the shift count is negative (and the hence the
+ * Zap elements for which the shift count is zero (and hence the
* decrement below would yield a negative count.
*/
z &= (sh > 0);
--sh;
touch(sh);
y = z << sh;
- touch(sh);
if ( !to_bool(x == y + y) ) return __LINE__;
+# if defined(__AVX2__) && ELEM_SIZE >= 4
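+    /*
+     * With AVX2 (and element sizes of at least 4 bytes) the compiler can
+     * vectorize per-element variable shifts; check that shifting y back
+     * right by the same counts recovers z.
+     */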
+ touch(sh);
+ x = y >> sh;
+ if ( !to_bool(x == z) ) return __LINE__;
+# endif
+
# endif
#endif
#include "sse.h"
#include "sse2.h"
#include "sse4.h"
-#include "sse2-avx.h"
-#include "sse4-avx.h"
#include "avx.h"
#include "fma4.h"
#include "fma.h"
+#include "avx2.h"
#define verbose false /* Switch to true for far more logging. */
{
return cpu_has_avx;
}
-#define simd_check_sse2_avx simd_check_avx
-#define simd_check_sse4_avx simd_check_avx
static bool simd_check_fma4(void)
{
return cpu_has_fma4;
}
+static bool simd_check_avx2(void)
+{
+ return cpu_has_avx2;
+}
+
static void simd_set_regs(struct cpu_user_regs *regs)
{
if ( cpu_has_mmx )
SIMD(SSE4 packed u32, sse4, 16u4),
SIMD(SSE4 packed s64, sse4, 16i8),
SIMD(SSE4 packed u64, sse4, 16u8),
- SIMD(SSE2/AVX packed s8, sse2_avx, 16i1),
- SIMD(SSE2/AVX packed u8, sse2_avx, 16u1),
- SIMD(SSE2/AVX packed s16, sse2_avx, 16i2),
- SIMD(SSE2/AVX packed u16, sse2_avx, 16u2),
- SIMD(SSE2/AVX packed s32, sse2_avx, 16i4),
- SIMD(SSE2/AVX packed u32, sse2_avx, 16u4),
- SIMD(SSE2/AVX packed s64, sse2_avx, 16i8),
- SIMD(SSE2/AVX packed u64, sse2_avx, 16u8),
- SIMD(SSE4/AVX packed s8, sse4_avx, 16i1),
- SIMD(SSE4/AVX packed u8, sse4_avx, 16u1),
- SIMD(SSE4/AVX packed s16, sse4_avx, 16i2),
- SIMD(SSE4/AVX packed u16, sse4_avx, 16u2),
- SIMD(SSE4/AVX packed s32, sse4_avx, 16i4),
- SIMD(SSE4/AVX packed u32, sse4_avx, 16u4),
- SIMD(SSE4/AVX packed s64, sse4_avx, 16i8),
- SIMD(SSE4/AVX packed u64, sse4_avx, 16u8),
SIMD(AVX scalar single, avx, f4),
SIMD(AVX 128bit single, avx, 16f4),
SIMD(AVX 256bit single, avx, 32f4),
SIMD(FMA scalar double, fma, f8),
SIMD(FMA 128bit double, fma, 16f8),
SIMD(FMA 256bit double, fma, 32f8),
+ SIMD(AVX2 128bit single, avx2, 16f4),
+ SIMD(AVX2 256bit single, avx2, 32f4),
+ SIMD(AVX2 128bit double, avx2, 16f8),
+ SIMD(AVX2 256bit double, avx2, 32f8),
+ SIMD(AVX2 s8x16, avx2, 16i1),
+ SIMD(AVX2 u8x16, avx2, 16u1),
+ SIMD(AVX2 s16x8, avx2, 16i2),
+ SIMD(AVX2 u16x8, avx2, 16u2),
+ SIMD(AVX2 s32x4, avx2, 16i4),
+ SIMD(AVX2 u32x4, avx2, 16u4),
+ SIMD(AVX2 s64x2, avx2, 16i8),
+ SIMD(AVX2 u64x2, avx2, 16u8),
+ SIMD(AVX2 s8x32, avx2, 32i1),
+ SIMD(AVX2 u8x32, avx2, 32u1),
+ SIMD(AVX2 s16x16, avx2, 32i2),
+ SIMD(AVX2 u16x16, avx2, 32u2),
+ SIMD(AVX2 s32x8, avx2, 32i4),
+ SIMD(AVX2 u32x8, avx2, 32u4),
+ SIMD(AVX2 s64x4, avx2, 32i8),
+ SIMD(AVX2 u64x4, avx2, 32u8),
#undef SIMD_
#undef SIMD
};
else
printf("skipped\n");
+ printf("%-40s", "Testing vpmaskmovd %xmm1,%xmm2,(%edx)...");
+ if ( stack_exec && cpu_has_avx2 )
+ {
+ decl_insn(vpmaskmovd);
+
+ asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+ "vpinsrd $0b00, %1, %%xmm1, %%xmm2\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+ put_insn(vpmaskmovd, "vpmaskmovd %%xmm1, %%xmm2, (%0)")
+#else
+ put_insn(vpmaskmovd,
+ ".byte 0xc4, 0xe2, 0x69, 0x8e, 0x0a")
+#endif
+ :: "d" (NULL), "r" (~0) );
+
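+    /*
+     * Only element 0's mask is set; its slot is the final dword of the
+     * mapping, so the emulated store may touch just those 4 bytes and the
+     * 0xdb filler below them must survive.
+     */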
+ memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+ set_insn(vpmaskmovd);
+ regs.edx = (unsigned long)res + MMAP_SZ - 4;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+ res[MMAP_SZ / sizeof(*res) - 1] ||
+ memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+ res + MMAP_SZ / sizeof(*res) - 4, 12) )
+ goto fail;
+
+ asm volatile ( "vpinsrd $0b11, %0, %%xmm1, %%xmm2" :: "r" (~0) );
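+    /*
+     * Now only element 3's mask is set; its slot is res[0], while
+     * res[1..3] must keep their 0xdb filler.
+     */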
+ memset(res, 0xdb, 32);
+ set_insn(vpmaskmovd);
+ regs.edx = (unsigned long)(res - 3);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+ res[0] || memcmp(res + 1, res + 4, 12) )
+ goto fail;
+
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vpmaskmovq %xmm1,%xmm2,(%edx)...");
+ if ( stack_exec && cpu_has_avx2 )
+ {
+ decl_insn(vpmaskmovq);
+
+ asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+ "vpcmpeqd %%xmm0, %%xmm0, %%xmm0\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+ "vpblendd $0b0011, %%xmm0, %%xmm1, %%xmm2\n\t"
+ put_insn(vpmaskmovq, "vpmaskmovq %%xmm1, %%xmm2, (%0)")
+#else
+ ".byte 0xc4, 0xe3, 0x71, 0x02, 0xd0, 0b0011\n\t"
+ put_insn(vpmaskmovq,
+ ".byte 0xc4, 0xe2, 0xe9, 0x8e, 0x0a")
+#endif
+ :: "d" (NULL) );
+
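+    /*
+     * vpblendd copies xmm0's all-ones low two dwords into xmm2, i.e. only
+     * the low qword's mask bit is set; its slot is the final qword of the
+     * mapping.
+     */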
+ memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+ set_insn(vpmaskmovq);
+ regs.edx = (unsigned long)res + MMAP_SZ - 8;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+ res[MMAP_SZ / sizeof(*res) - 1] ||
+ res[MMAP_SZ / sizeof(*res) - 2] ||
+ memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+ res + MMAP_SZ / sizeof(*res) - 4, 8) )
+ goto fail;
+
+#if 0 /* Don't use AVX2 instructions for now */
+ asm volatile ( "vpermq $0b00000001, %ymm2, %ymm2" );
+#else
+ asm volatile ( ".byte 0xc4, 0xe3, 0xfd, 0x00, 0xd2, 0b00000001" );
+#endif
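+    /*
+     * vpermq moves the set mask bit to qword 1, whose slot is res[0..1];
+     * res[2..3] must keep their 0xdb filler.
+     */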
+ memset(res, 0xdb, 32);
+ set_insn(vpmaskmovq);
+ regs.edx = (unsigned long)(res - 2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+ res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+ goto fail;
+
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing stmxcsr (%edx)...");
if ( cpu_has_sse )
{
[0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10] = { .simd_size = simd_packed_int },
[0x13] = { .simd_size = simd_other, .two_op = 1 },
- [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
+ [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
[0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
[0x1a] = { .simd_size = simd_128, .two_op = 1 },
[0x2c ... 0x2d] = { .simd_size = simd_other },
[0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
[0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
- [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
+ [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
[0x40] = { .simd_size = simd_packed_int },
[0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x45 ... 0x47] = { .simd_size = simd_packed_int },
+ [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
+ [0x5a] = { .simd_size = simd_128, .two_op = 1 },
+ [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+ [0x8c] = { .simd_size = simd_other },
+ [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
[0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
[0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
[0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
uint8_t two_op:1;
uint8_t four_op:1;
} ext0f3a_table[256] = {
+ [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
+ [0x02] = { .simd_size = simd_packed_int },
[0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
[0x06] = { .simd_size = simd_packed_fp },
[0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
[0x20] = { .simd_size = simd_none },
[0x21] = { .simd_size = simd_other },
[0x22] = { .simd_size = simd_none },
+ [0x38] = { .simd_size = simd_128 },
+ [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
[0x42] = { .simd_size = simd_packed_int },
[0x44] = { .simd_size = simd_packed_int },
+ [0x46] = { .simd_size = simd_packed_int },
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( !vex.l )
goto simd_0f_avx;
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x45): /* vpsrlv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x47): /* vpsllv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_avx2:
host_and_vcpu_must_have(avx2);
goto simd_0f_ymm;
}
case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( vex.l )
+ {
+ simd_0f_imm8_avx2:
host_and_vcpu_must_have(avx2);
+ }
else
{
case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
op_bytes = 8 << vex.l;
goto simd_0f_ymm;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
+ generate_exception_if(!vex.l || vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_avx;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
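+        /* Element size from the opcode: 0x58/0x59 -> 4/8 bytes, 0x78/0x79 -> 1/2. */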
+ op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */
+ generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
+ {
+ typeof(vex) *pvex;
+ unsigned int mask = vex.w ? 0x80808080U : 0x88888888U;
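+        /*
+         * vpmovmskb (see below) yields one bit per byte; these masks retain
+         * only the bit from the top byte of each dword (W clear) or qword
+         * (W set) element.
+         */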
+
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+
+ /*
+ * While we can't reasonably provide fully correct behavior here
+ * (in particular, for writes, avoiding the memory read in anticipation
+ * of all elements in the range eventually being written), we can (and
+ * should) still limit the memory access to the smallest possible range
+ * (suppressing it altogether if all mask bits are clear), to provide
+ * correct faulting behavior. Read the mask bits via vpmovmskb
+ * for that purpose.
+ */
+ opc = init_prefixes(stub);
+ pvex = copy_VEX(opc, vex);
+ pvex->opcx = vex_0f;
+ opc[0] = 0xd7; /* vpmovmskb */
+ /* Use %rax as GPR destination and VEX.vvvv as source. */
+ pvex->r = 1;
+ pvex->b = !mode_64bit() || (vex.reg >> 3);
+ opc[1] = 0xc0 | (~vex.reg & 7);
+ pvex->reg = 0xf;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+ put_stub(stub);
+
+ /* Convert byte granular result to dword/qword granularity. */
+ ea.val &= mask;
+ if ( !ea.val )
+ goto complete_insn;
+
+ first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1);
+ ea.val >>= first_byte;
+ op_bytes = 32 - __builtin_clz(ea.val);
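+        /*
+         * Worked example (W clear, i.e. mask 0x88888888): if just mask
+         * elements 1 and 2 have their top bits set, vpmovmskb leaves bits 7
+         * and 11 in ea.val (0x880), so first_byte is 4 and op_bytes 8, i.e.
+         * only bytes 4...11 of the operand get accessed.
+         */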
+
+ /*
+ * Even for the memory write variant a memory read is needed, unless
+ * all set mask bits are contiguous.
+ */
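+        /*
+         * E.g. (W clear) ea.val 0x808, i.e. elements 0 and 2 set: the carry
+         * from adding ~mask + 1 stops at the unset element 1, bit 11 stays
+         * set in the sum as well as in ea.val, and the read gets forced.
+         */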
+ if ( ea.val & (ea.val + ~mask + 1) )
+ d = (d & ~SrcMask) | SrcMem;
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] = modrm & 0x38;
+ fic.insn_bytes = PFX_BYTES + 2;
+
+ break;
+ }
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
: "0" ((uint32_t)src.val), "rm" (_regs.edx) );
break;
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x01): /* vpermpd $imm8,ymm/m256,ymm */
+ generate_exception_if(!vex.l || !vex.w, EXC_UD);
+ goto simd_0f_imm8_avx2;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */
+ generate_exception_if(!vex.l, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x02): /* vpblendd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_imm8_avx2;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
{
case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} */
/* These have merge semantics; force write to occur. */
d |= Mov;
break;