INSN(pcmpgtd, 66, 0f, 66, vl, d, vl),
INSN(pcmpgtq, 66, 0f38, 37, vl, q, vl),
INSN(pcmpu, 66, 0f3a, 1e, vl, dq, vl),
+ INSN(permi2, 66, 0f38, 76, vl, dq, vl),
+ INSN(permi2, 66, 0f38, 77, vl, sd, vl),
+ INSN(permt2, 66, 0f38, 7e, vl, dq, vl),
+ INSN(permt2, 66, 0f38, 7f, vl, sd, vl),
INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl),
INSN(pmaxu, 66, 0f38, 3f, vl, dq, vl),
INSN(pmins, 66, 0f38, 39, vl, dq, vl),
INSN(pcmpgtb, 66, 0f, 64, vl, b, vl),
INSN(pcmpgtw, 66, 0f, 65, vl, w, vl),
INSN(pcmpu, 66, 0f3a, 3e, vl, bw, vl),
+ INSN(permi2w, 66, 0f38, 75, vl, w, vl),
+ INSN(permt2w, 66, 0f38, 7d, vl, w, vl),
INSN(pmaddwd, 66, 0f, f5, vl, w, vl),
INSN(pmaxsb, 66, 0f38, 3c, vl, b, vl),
INSN(pmaxsw, 66, 0f, ee, vl, w, vl),
INSN(inserti32x8, 66, 0f3a, 3a, el_8, d, vl),
};
+static const struct test avx512_vbmi_all[] = {
+ INSN(permi2b, 66, 0f38, 75, vl, b, vl),
+ INSN(permt2b, 66, 0f38, 7d, vl, b, vl),
+};
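As background for the new table entries (and the matching emulator cases further down): vpermi2* / vpermt2* select each result element from the concatenation of two source tables, with the "i" forms overwriting the index register and the "t" forms the first table register. A minimal scalar sketch of those semantics (illustration only, not harness code; a power-of-two element count is assumed):

    /* Model of vpermi2*: each index selects from the concatenation of two
     * tables; the result replaces the index operand.  For vpermt2* the
     * loop body is the same, but the result replaces t1 instead. */
    static void permi2(unsigned int idx[], const unsigned int t1[],
                       const unsigned int t2[], unsigned int n)
    {
        unsigned int i, res[64];

        for ( i = 0; i < n; ++i )
        {
            unsigned int sel = idx[i] & (2 * n - 1);

            res[i] = sel < n ? t1[sel] : t2[sel - n];
        }

        for ( i = 0; i < n; ++i )
            idx[i] = res[i];
    }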
+
static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
static const unsigned char vl_128[] = { VL_128 };
static const unsigned char vl_no128[] = { VL_512, VL_256 };
RUN(avx512dq, 128);
RUN(avx512dq, no128);
RUN(avx512dq, 512);
+ RUN(avx512_vbmi, all);
}
# define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
# define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
# define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+# else
+# define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
+# define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
# endif
# elif FLOAT_SIZE == 8
# if VEC_SIZE == 16
# define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
# define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
# define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+# else
+# define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
+# define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
# endif
# endif
#elif FLOAT_SIZE == 4 && defined(__SSE__)
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+# else
+# define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
+# define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
(0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# else
+# define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
+# define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
# endif
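A note on the operand order used above (taken from the GCC builtin prototypes the B() macro expands to, so treat the exact signatures as an assumption): the vpermi2var* builtins take (table1, indices, table2) and, at instruction level, overwrite the index register, while the vpermt2var* ones take (indices, table1, table2) and overwrite the first table. That is why interleave_hi() passes the index vector as the middle argument and interleave_lo() as the first:

    typedef int __attribute__((vector_size(64))) v16si_t;

    /* Both compute the same permutation; they differ only in which
     * register the instruction overwrites (and, under masking, merges
     * into). */
    v16si_t permi2_d(v16si_t t1, v16si_t idx, v16si_t t2)
    {
        return __builtin_ia32_vpermi2vard512_mask(t1, idx, t2, ~0);
    }

    v16si_t permt2_d(v16si_t idx, v16si_t t1, v16si_t t2)
    {
        return __builtin_ia32_vpermt2vard512_mask(idx, t1, t2, ~0);
    }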
{
unsigned int i, j;
vec_t x, y, z, src, inv, alt, sh;
+ vint_t interleave_lo, interleave_hi;
for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
{
if ( !(i & (i + 1)) )
--j;
sh[i] = j;
+
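+ /*
+ * Index vectors for a full-width interleave of two sources: with
+ * ELEM_COUNT == 8, interleave_lo becomes { 0, 8, 1, 9, 2, 10, 3, 11 }
+ * (even slots from the low half of the first source, odd slots from
+ * that of the second) and interleave_hi { 4, 12, 5, 13, 6, 14, 7, 15 }.
+ */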
+ interleave_lo[i] = ((i & 1) * ELEM_COUNT) | (i >> 1);
+ interleave_hi[i] = interleave_lo[i] + (ELEM_COUNT / 2);
}
touch(src);
x = src * alt;
y = interleave_lo(x, alt < 0);
touch(x);
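+ /*
+ * Assumption behind the change below: the AVX512 flavours of widen1()
+ * consume a half-width source vector, so the low half needs extracting
+ * explicitly via low_half().
+ */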
- z = widen1(x);
+ z = widen1(low_half(x));
touch(x);
if ( !eq(z, y) ) return __LINE__;
# ifdef widen1
touch(src);
- x = widen1(src);
+ x = widen1(low_half(src));
touch(src);
if ( !eq(x, y) ) return __LINE__;
# endif
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
#endif
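+/*
+ * An integer vector type matching vec_t in element count and width, usable
+ * e.g. for the index operands of the vpermi2/vpermt2 based helpers.
+ */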
+#if ELEM_SIZE == 1
+typedef vqi_t vint_t;
+#elif ELEM_SIZE == 2
+typedef vhi_t vint_t;
+#elif ELEM_SIZE == 4
+typedef vsi_t vint_t;
+#elif ELEM_SIZE == 8
+typedef vdi_t vint_t;
+#endif
+
#if VEC_SIZE >= 16
# if ELEM_COUNT >= 2
#define cpu_has_avx512dq (cp.feat.avx512dq && xcr0_mask(0xe6))
#define cpu_has_avx512bw (cp.feat.avx512bw && xcr0_mask(0xe6))
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
+#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
#define cpu_has_xgetbv1 (cpu_has_xsave && cp.xstate.xgetbv1)
[0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
[0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
[0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+ [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+ [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x78] = { .simd_size = simd_other, .two_op = 1 },
[0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 },
[0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
+ [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+ [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
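The d8s_vl values feed EVEX disp8 compression: memory operands of these insns scale an 8-bit displacement by the full vector length. A minimal sketch of that scaling (helper name hypothetical, not the emulator's actual code):

    #include <stdint.h>

    /* EVEX.L'L of 0/1/2 selects a 16/32/64-byte vector; for a d8s_vl
     * (full-vector) operand the compressed displacement scales by that. */
    static int32_t disp8_scaled(int8_t disp8, unsigned int evex_ll)
    {
        return disp8 * (16 << evex_ll);
    }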
[0x8c] = { .simd_size = simd_packed_int },
[0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
[0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
#define vcpu_has_sha() (ctxt->cpuid->feat.sha)
#define vcpu_has_avx512bw() (ctxt->cpuid->feat.avx512bw)
#define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl)
+#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
#define vcpu_has_rdpid() (ctxt->cpuid->feat.rdpid)
#define vcpu_must_have(feat) \
CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x76): /* vpermi2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x77): /* vpermi2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7e): /* vpermt2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7f): /* vpermt2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
fault_suppression = false;
/* fall through */
case X86EMUL_OPC_EVEX_66(0x0f, 0xdb): /* vpand{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
goto simd_0f_avx2;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
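+ /* EVEX.W selects the element width: W0 is the byte form (vpermi2b /
+ * vpermt2b, AVX512_VBMI), W1 the word form (vpermi2w / vpermt2w,
+ * AVX512BW). */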
+ if ( !evex.w )
+ host_and_vcpu_must_have(avx512_vbmi);
+ else
+ host_and_vcpu_must_have(avx512bw);
+ generate_exception_if(evex.brs, EXC_UD);
+ fault_suppression = false;
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */
host_and_vcpu_must_have(avx512bw);
#define cpu_has_avx512vl boot_cpu_has(X86_FEATURE_AVX512VL)
/* CPUID level 0x00000007:0.ecx */
+#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI)
#define cpu_has_rdpid boot_cpu_has(X86_FEATURE_RDPID)
/* CPUID level 0x80000007.edx */
/* Intel-defined CPU features, CPUID level 0x00000007:0.ecx, word 6 */
XEN_CPUFEATURE(PREFETCHWT1, 6*32+ 0) /*A PREFETCHWT1 instruction */
-XEN_CPUFEATURE(AVX512VBMI, 6*32+ 1) /*A AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(AVX512_VBMI, 6*32+ 1) /*A AVX-512 Vector Byte Manipulation Instrs */
XEN_CPUFEATURE(UMIP, 6*32+ 2) /*S User Mode Instruction Prevention */
XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */
XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */
AVX2: [AVX512F],
# AVX512F is taken to mean hardware support for 512bit registers
- # (which in practice depends on the EVEX prefix to encode), and the
- # instructions themselves. All further AVX512 features are built on
- # top of AVX512F
+ # (which in practice require the EVEX prefix to encode) as well
+ # as mask registers, and the instructions themselves. All further
+ # AVX512 features are built on top of AVX512F
AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
- AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW,
- AVX512_4FMAPS, AVX512_VPOPCNTDQ],
+ AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
+ AVX512_VPOPCNTDQ],
+
+ # AVX512 extensions acting solely on vectors of bytes/words are made
+ # dependents of AVX512BW (as they require mask registers wider than
+ # 16 bits), despite the SDM not formally making this connection.
+ AVX512BW: [AVX512_VBMI],
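To illustrate the reasoning: a 512-bit operation at byte granularity needs a 64-bit write mask, and mask registers wider than 16 bits are an AVX512BW feature. A minimal intrinsics sketch (assuming a compiler with AVX512_VBMI support, e.g. building with -mavx512vbmi):

    #include <immintrin.h>

    /* vpermi2b under a 64-bit write mask: the __mmask64 operand is what
     * ties byte-granularity insns to AVX512BW's wide mask registers. */
    __m512i masked_byte_permute(__m512i tbl1, __m512i idx, __m512i tbl2,
                                __mmask64 k)
    {
        return _mm512_mask2_permutex2var_epi8(tbl1, idx, k, tbl2);
    }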
# The features:
# * Single Thread Indirect Branch Predictors