INSN_FP(cmp, 0f, c2),
INSN(comisd, 66, 0f, 2f, el, q, el),
INSN(comiss, , 0f, 2f, el, d, el),
+ INSN(cvtpd2ps, 66, 0f, 5a, vl, q, vl),
+ INSN(cvtph2ps, 66, 0f38, 13, vl_2, d_nb, vl),
+ INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl),
+ INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl),
+ INSN(cvtsd2ss, f2, 0f, 5a, el, q, el),
+ INSN(cvtss2sd, f3, 0f, 5a, el, d, el),
INSN_FP(div, 0f, 5e),
INSN(fmadd132, 66, 0f38, 98, vl, sd, vl),
INSN(fmadd132, 66, 0f38, 99, el, sd, el),
else
printf("skipped\n");
+ printf("%-40s", "Testing vcvtph2ps 32(%ecx),%zmm7{%k4}...");
+ if ( stack_exec && cpu_has_avx512f )
+ {
+ decl_insn(evex_vcvtph2ps);
+ decl_insn(evex_vcvtps2ph);
+
+ asm volatile ( "vpternlogd $0x81, %%zmm7, %%zmm7, %%zmm7\n\t"
+ "kmovw %1,%%k4\n"
+ put_insn(evex_vcvtph2ps, "vcvtph2ps 32(%0), %%zmm7%{%%k4%}")
+ :: "c" (NULL), "r" (0x3333) );
+
+ set_insn(evex_vcvtph2ps);
+ memset(res, 0xff, 128);
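+ /* %k4 = 0x3333: FP16 element pairs 0/1, 4/5, 8/9 and 12/13 are selected. */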
+ res[8] = 0x40003c00; /* (1.0, 2.0) */
+ res[10] = 0x44004200; /* (3.0, 4.0) */
+ res[12] = 0x3400b800; /* (-.5, .25) */
+ res[14] = 0xbc000000; /* (0.0, -1.) */
+ regs.ecx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
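+ /* Save the (masked) conversion result; it also serves as vcvtps2ph input below. */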
+ asm volatile ( "vmovups %%zmm7, %0" : "=m" (res[16]) );
+ if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtph2ps) )
+ goto fail;
+ printf("okay\n");
+
+ printf("%-40s", "Testing vcvtps2ph $0,%zmm3,64(%edx){%k4}...");
+ asm volatile ( "vmovups %0, %%zmm3\n"
+ put_insn(evex_vcvtps2ph, "vcvtps2ph $0, %%zmm3, 128(%1)%{%%k4%}")
+ :: "m" (res[16]), "d" (NULL) );
+
+ set_insn(evex_vcvtps2ph);
+ regs.edx = (unsigned long)res;
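+ /* 128(%edx) == res + 32; pre-fill with 0xcc to tell masked-off elements apart. */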
+ memset(res + 32, 0xcc, 32);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtps2ph) )
+ goto fail;
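+ /* Dwords masked off by %k4 keep their 0xcc filler at the destination; match that in the reference data. */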
+ res[15] = res[13] = res[11] = res[9] = 0xcccccccc;
+ if ( memcmp(res + 8, res + 32, 32) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
#undef decl_insn
#undef put_insn
#undef set_insn
[0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
[0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
[0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
- [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x5a] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+ [0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
[0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
[0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x0e ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
- [0x13] = { .simd_size = simd_other, .two_op = 1 },
+ [0x13] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
[0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
[0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
[0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
[0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
- [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
+ [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
[0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x20] = { .simd_size = simd_none, .d8s = 0 },
[0x21] = { .simd_size = simd_other, .d8s = 2 },
modrm_mod = 3;
break;
+ case 0x5a: /* vcvtps2pd needs special casing */
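+ /*
+  * Without a legacy prefix this is vcvtps2pd, whose memory source is only
+  * half the vector width, so halve the Disp8 scaling (no change when
+  * broadcasting).
+  */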
+ if ( disp8scale && !evex.pfx && !evex.brs )
+ --disp8scale;
+ break;
+
case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
if ( disp8scale == 2 && evex.pfx == vex_f3 )
disp8scale = 3;
CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5d): /* vmin{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ avx512f_all_fp:
generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
(ea.type != OP_REG && evex.brs &&
(evex.pfx & VEX_PREFIX_SCALAR_MASK))),
EXC_UD);
goto simd_zmm;
CASE_SIMD_ALL_FP(, 0x0f, 0x5a): /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
- CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} {x,y}mm/mem,{x,y}mm */
/* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */
op_bytes = 4 << (((vex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + vex.l) +
!!(vex.pfx & VEX_PREFIX_DOUBLE_MASK));
if ( vex.opcx == vex_none )
goto simd_0f_sse2;
goto simd_0f_avx;
+ CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+ /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm{k} */
+ op_bytes = 4 << (((evex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + evex.lr) +
+ evex.w);
+ goto avx512f_all_fp;
+
CASE_SIMD_PACKED_FP(, 0x0f, 0x5b): /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x5b): /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
case X86EMUL_OPC_F3(0x0f, 0x5b): /* cvttps2dq xmm/mem,xmm */
op_bytes = 8 << vex.l;
goto simd_0f_ymm;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
+ generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs), EXC_UD);
+ host_and_vcpu_must_have(avx512f);
+ if ( !evex.brs )
+ avx512_vlen_check(false);
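+ /* Source is FP16: half the vector width, 2 bytes per element. */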
+ op_bytes = 8 << evex.lr;
+ elem_bytes = 2;
+ goto simd_zmm;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
generate_exception_if(!vex.l || vex.w, EXC_UD);
goto avx512f_imm8_no_sae;
case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,[xyz]mm,{x,y}mm/mem{k} */
{
uint32_t mxcsr;
- generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
- host_and_vcpu_must_have(f16c);
fail_if(!ops->write);
+ if ( evex_encoded() )
+ {
+ generate_exception_if((evex.w || evex.reg != 0xf || !evex.RX ||
+ (ea.type != OP_REG && (evex.z || evex.brs))),
+ EXC_UD);
+ host_and_vcpu_must_have(avx512f);
+ avx512_vlen_check(false);
+ opc = init_evex(stub);
+ }
+ else
+ {
+ generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(f16c);
+ opc = init_prefixes(stub);
+ }
+
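+ /* FP16 output size: 2 bytes per source element, i.e. half the source vector width. */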
+ op_bytes = 8 << evex.lr;
- opc = init_prefixes(stub);
opc[0] = b;
opc[1] = modrm;
if ( ea.type == OP_MEM )
{
/* Convert memory operand to (%rAX). */
vex.b = 1;
+ evex.b = 1;
opc[1] &= 0x38;
}
opc[2] = imm1;
- insn_bytes = PFX_BYTES + 3;
+ if ( evex_encoded() )
+ {
+ unsigned int full = 0;
+
+ insn_bytes = EVEX_PFX_BYTES + 3;
+ copy_EVEX(opc, evex);
+
+ if ( ea.type == OP_MEM && evex.opmsk )
+ {
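+ /* Mask covering all op_bytes / 2 FP16 elements of the destination. */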
+ full = 0xffff >> (16 - op_bytes / 2);
+ op_mask &= full;
+ if ( !op_mask )
+ goto complete_insn;
+
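+ /* Trim the store to the span between the first and last set mask bits (2 bytes per element). */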
+ first_byte = __builtin_ctz(op_mask);
+ op_mask >>= first_byte;
+ full >>= first_byte;
+ first_byte <<= 1;
+ op_bytes = (32 - __builtin_clz(op_mask)) << 1;
+
+ /*
+ * We may need to read (parts of) the memory operand for the
+ * purpose of merging in order to avoid splitting the write
+ * below into multiple ones.
+ */
+ if ( op_mask != full &&
+ (rc = ops->read(ea.mem.seg,
+ truncate_ea(ea.mem.off + first_byte),
+ (void *)mmvalp + first_byte, op_bytes,
+ ctxt)) != X86EMUL_OKAY )
+ goto done;
+ }
+ }
+ else
+ {
+ insn_bytes = PFX_BYTES + 3;
+ copy_VEX(opc, vex);
+ }
opc[3] = 0xc3;
- copy_VEX(opc, vex);
/* Latch MXCSR - we may need to restore it below. */
invoke_stub("stmxcsr %[mxcsr]", "",
"=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp));
if ( ea.type == OP_MEM )
{
- rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
+ rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte),
+ (void *)mmvalp + first_byte, op_bytes, ctxt);
if ( rc != X86EMUL_OKAY )
{
asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );