]> xenbits.xensource.com Git - qemu-xen.git/commitdiff
target/i386: Destructive FP helpers for AVX
authorPaul Brook <paul@nowt.org>
Sun, 24 Apr 2022 22:01:39 +0000 (23:01 +0100)
committerPaolo Bonzini <pbonzini@redhat.com>
Thu, 1 Sep 2022 18:16:33 +0000 (20:16 +0200)
Perpare the horizontal atithmetic vector helpers for AVX
These currently use a dummy Reg typed variable to store the result then
assign the whole register.  This will cause 128 bit operations to corrupt
the upper half of the register, so replace it with explicit temporaries
and element assignments.

Signed-off-by: Paul Brook <paul@nowt.org>
Message-Id: <20220424220204.2493824-18-paul@nowt.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
target/i386/ops_sse.h

index c9737e16b951ce0f80ce824d20837ab44ddd0123..c6dba9572dea79ebfc323701b3ea4cc55c7c2e53 100644 (file)
@@ -22,7 +22,6 @@
 
 #if SHIFT == 0
 #define Reg MMXReg
-#define SIZE 8
 #define XMM_ONLY(...)
 #define B(n) MMX_B(n)
 #define W(n) MMX_W(n)
@@ -31,7 +30,6 @@
 #define SUFFIX _mmx
 #else
 #define Reg ZMMReg
-#define SIZE 16
 #define XMM_ONLY(...) __VA_ARGS__
 #define B(n) ZMM_B(n)
 #define W(n) ZMM_W(n)
 #define LANE_WIDTH (SHIFT ? 16 : 8)
 #define PACK_WIDTH (LANE_WIDTH / 2)
 
-/*
- * Copy the relevant parts of a Reg value around. In the case where
- * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of
- * a 64 byte ZMMReg, so we must copy only those and keep the top bytes
- * untouched in the guest-visible destination destination register.
- * Note that the "lower bytes" are placed last in memory on big-endian
- * hosts, which store the vector backwards in memory.  In that case the
- * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of
- * the little-endian case.
- */
-#if HOST_BIG_ENDIAN
-#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE)
-#else
-#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
-#endif
-
 #if SHIFT == 0
 #define FPSRL(x, c) ((x) >> shift)
 #define FPSRAW(x, c) ((int16_t)(x) >> shift)
@@ -945,45 +927,49 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
 }
 
-void glue(helper_haddps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
-{
-    ZMMReg r;
-
-    r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
-    r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
-    r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
-    r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
-    MOVE(*d, r);
-}
-
-void glue(helper_haddpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
-{
-    ZMMReg r;
-
-    r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
-    r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
-    MOVE(*d, r);
-}
-
-void glue(helper_hsubps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
-{
-    ZMMReg r;
-
-    r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
-    r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
-    r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
-    r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
-    MOVE(*d, r);
-}
-
-void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
-{
-    ZMMReg r;
-
-    r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
-    r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
-    MOVE(*d, r);
-}
+#define SSE_HELPER_HPS(name, F)  \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+{                                                                 \
+    Reg *v = d;                                                   \
+    float32 r[2 << SHIFT];                                        \
+    int i, j, k;                                                  \
+    for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) {            \
+        for (i = j = 0; j < 4; i++, j += 2) {                     \
+            r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
+        }                                                         \
+        for (j = 0; j < 4; i++, j += 2) {                         \
+            r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
+        }                                                         \
+    }                                                             \
+    for (i = 0; i < 2 << SHIFT; i++) {                            \
+        d->ZMM_S(i) = r[i];                                       \
+    }                                                             \
+}
+
+SSE_HELPER_HPS(haddps, float32_add)
+SSE_HELPER_HPS(hsubps, float32_sub)
+
+#define SSE_HELPER_HPD(name, F)  \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+{                                                                 \
+    Reg *v = d;                                                   \
+    float64 r[1 << SHIFT];                                        \
+    int i, j, k;                                                  \
+    for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) {            \
+        for (i = j = 0; j < 2; i++, j += 2) {                     \
+            r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
+        }                                                         \
+        for (j = 0; j < 2; i++, j += 2) {                         \
+            r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
+        }                                                         \
+    }                                                             \
+    for (i = 0; i < 1 << SHIFT; i++) {                            \
+        d->ZMM_D(i) = r[i];                                       \
+    }                                                             \
+}
+
+SSE_HELPER_HPD(haddpd, float64_add)
+SSE_HELPER_HPD(hsubpd, float64_sub)
 
 void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
@@ -2331,4 +2317,3 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
 #undef L
 #undef Q
 #undef SUFFIX
-#undef SIZE