ENTRY(simd_test);
-#if VEC_SIZE == 8 && defined(__SSE__)
+#if defined(__AVX512F__)
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
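+/*
+ * For the scalar (VEC_SIZE == FLOAT_SIZE) cases below, compare via
+ * vcmps{s,d} with predicate 0 (EQ_OQ); the outcome lands in bit 0 of the
+ * mask register.
+ */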
+# if VEC_SIZE == 4
+# define eq(x, y) ({ \
+ float x_ = (x)[0]; \
+ float __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+ unsigned short r_; \
+ asm ( "vcmpss $0, %1, %2, %0" : "=k" (r_) : "m" (x_), "v" (y_) ); \
+ r_ == 1; \
+})
+# elif VEC_SIZE == 8
+# define eq(x, y) ({ \
+ double x_ = (x)[0]; \
+ double __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+ unsigned short r_; \
+ asm ( "vcmpsd $0, %1, %2, %0" : "=k" (r_) : "m" (x_), "v" (y_) ); \
+ r_ == 1; \
+})
+# elif FLOAT_SIZE == 4
+/*
+ * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
+ * that its return type is QI rather than UQI, and hence the value would get
+ * sign-extended before comparing to ALL_TRUE. The same oddity does not matter
+ * for __builtin_ia32_cmppd256_mask(), as only 4 bits are significant there.
+ * Hence the extra " & ALL_TRUE".
+ */
+# define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
+# elif FLOAT_SIZE == 8
+# define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+# endif
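+/*
+ * Example: with INT_SIZE == 4 and VEC_SIZE == 16, ELEM_COUNT is 4 and
+ * ALL_TRUE is 0xf, i.e. eq() yields true only if all four lanes compare
+ * equal.
+ */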
+#elif VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
# if defined(__AVX__) && defined(FLOAT_SIZE)
touch(x); \
__builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
})
+#elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
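+/* Unlike their legacy counterparts, vsqrts{s,d} take an extra source operand. */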
+# if FLOAT_SIZE == 4
+# define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
+# elif FLOAT_SIZE == 8
+# define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
+# endif
+#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
+ (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if FLOAT_SIZE == 4
+# define broadcast(x) ({ \
+ vec_t t_; \
+ asm ( "%{evex%} vbroadcastss %1, %0" \
+ : "=v" (t_) : "m" (*(float[1]){ x }) ); \
+ t_; \
+})
+# define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
+# define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
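+/* mix(): a set mask bit selects the lane from x, a clear one from y. */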
+# define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+# define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
+# define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
+# define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+# endif
+# elif FLOAT_SIZE == 8
+# if VEC_SIZE >= 32
+# define broadcast(x) ({ \
+ vec_t t_; \
+ asm ( "%{evex%} vbroadcastsd %1, %0" : "=v" (t_) \
+ : "m" (*(double[1]){ x }) ); \
+ t_; \
+})
+# else
+# define broadcast(x) ({ \
+ vec_t t_; \
+ asm ( "%{evex%} vpbroadcastq %1, %0" \
+ : "=v" (t_) : "m" (*(double[1]){ x }) ); \
+ t_; \
+})
+# endif
+# define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
+# define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
+# define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+# define sqrt(x) BR(sqrtpd, _mask, x, undef(), ~0)
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
+# define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
+# define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+# endif
+# endif
#elif FLOAT_SIZE == 4 && defined(__SSE__)
# if VEC_SIZE == 32 && defined(__AVX__)
# if defined(__AVX2__)
# define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
# endif
#endif
-#if VEC_SIZE == 16 && defined(__SSE2__)
+#if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
+ defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 4 || UINT_SIZE == 4
+# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
+ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
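+/* The & clamps the constant to ELEM_COUNT bits, matching ALL_TRUE above. */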
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+# endif
+# if INT_SIZE == 4
+# define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
+# define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
+# define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 4
+# define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+# define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+# define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
+# elif INT_SIZE == 8
+# define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 8
+# define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# endif
+#elif VEC_SIZE == 16 && defined(__SSE2__)
# if INT_SIZE == 1 || UINT_SIZE == 1
# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
# endif
#endif
+#if defined(__AVX512F__) && defined(FLOAT_SIZE)
+# include "simd-fma.c"
+#endif
+
int simd_test(void)
{
unsigned int i, j;
# endif
#endif
-#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+#if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \
+ (defined(__AVX512F__) && defined(FLOAT_SIZE))
return -fma_test();
#endif
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
#endif
+#if VEC_SIZE == 16
+# define B(n, s, a...) __builtin_ia32_ ## n ## 128 ## s(a)
+# define B_(n, s, a...) __builtin_ia32_ ## n ## s(a)
+#elif VEC_SIZE == 32
+# define B(n, s, a...) __builtin_ia32_ ## n ## 256 ## s(a)
+#elif VEC_SIZE == 64
+# define B(n, s, a...) __builtin_ia32_ ## n ## 512 ## s(a)
+# define BR(n, s, a...) __builtin_ia32_ ## n ## 512 ## s(a, 4)
+#endif
+#ifndef B_
+# define B_ B
+#endif
+#ifndef BR
+# define BR B
+# define BR_ B_
+#endif
+#ifndef BR_
+# define BR_ BR
+#endif
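+/*
+ * Example expansions: with VEC_SIZE == 16, B(pmaxsd, _mask, a, b, u, m)
+ * becomes __builtin_ia32_pmaxsd128_mask(a, b, u, m); with VEC_SIZE == 64,
+ * BR(sqrtps, _mask, x, u, m) becomes __builtin_ia32_sqrtps512_mask(x, u, m, 4),
+ * the trailing 4 being _MM_FROUND_CUR_DIRECTION.
+ */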
+
+#ifdef __AVX512F__
+
+/*
+ * The original plan was to effect use of EVEX encodings for scalar as well as
+ * 128- and 256-bit insn variants by restricting the compiler (on 64-bit only,
+ * of course) to use just XMM16-XMM31. All sorts of compiler errors result
+ * when doing this with gcc 8.2. Therefore we resort to injecting {evex}
+ * prefixes instead, which has the benefit of also working for 32-bit. Granted,
+ * there is a lot of escaping to get right here.
+ */
+asm ( ".macro override insn \n\t"
+ ".macro $\\insn o:vararg \n\t"
+ ".purgem \\insn \n\t"
+ "{evex} \\insn \\(\\)o \n\t"
+ ".macro \\insn o:vararg \n\t"
+ "$\\insn \\(\\(\\))o \n\t"
+ ".endm \n\t"
+ ".endm \n\t"
+ ".macro \\insn o:vararg \n\t"
+ "$\\insn \\(\\)o \n\t"
+ ".endm \n\t"
+ ".endm" );
+
+# define OVR(n) asm ( "override v" #n )
+# define OVR_SFP(n) OVR(n ## sd); OVR(n ## ss)
+
+# ifdef __AVX512VL__
+# ifdef __AVX512BW__
+# define OVR_BW(n) OVR(p ## n ## b); OVR(p ## n ## w)
+# else
+# define OVR_BW(n)
+# endif
+# define OVR_DQ(n) OVR(p ## n ## d); OVR(p ## n ## q)
+# define OVR_VFP(n) OVR(n ## pd); OVR(n ## ps)
+# else
+# define OVR_BW(n)
+# define OVR_DQ(n)
+# define OVR_VFP(n)
+# endif
+
+# define OVR_FMA(n, w) OVR_ ## w(n ## 132); OVR_ ## w(n ## 213); \
+ OVR_ ## w(n ## 231)
+# define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
+# define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
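+/*
+ * Example: OVR_FMA(fmadd, FP) overrides vfmadd{132,213,231}{sd,ss} and, when
+ * AVX512VL is in use, also vfmadd{132,213,231}{pd,ps}.
+ */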
+
+OVR_SFP(broadcast);
+OVR_SFP(comi);
+OVR_FP(add);
+OVR_FP(div);
+OVR(extractps);
+OVR_FMA(fmadd, FP);
+OVR_FMA(fmsub, FP);
+OVR_FMA(fnmadd, FP);
+OVR_FMA(fnmsub, FP);
+OVR(insertps);
+OVR_FP(max);
+OVR_FP(min);
+OVR(movd);
+OVR(movq);
+OVR_SFP(mov);
+OVR_FP(mul);
+OVR_FP(sqrt);
+OVR_FP(sub);
+OVR_SFP(ucomi);
+
+# undef OVR_VFP
+# undef OVR_SFP
+# undef OVR_INT
+# undef OVR_FP
+# undef OVR_FMA
+# undef OVR_DQ
+# undef OVR_BW
+# undef OVR
+
+#endif /* __AVX512F__ */
+
/*
* Suppress value propagation by the compiler, preventing unwanted
* optimization. This at once makes the compiler use memory operands
* more often, which for our purposes is the more interesting case.
*/
#define touch(var) asm volatile ( "" : "+m" (var) )
+
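+/*
+ * Yield a deliberately uninitialized vector, for use as the source/merge
+ * operand of _mask builtins invoked with an all-ones mask. The self-
+ * initialization suppresses "maybe uninitialized" warnings.
+ */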
+static inline vec_t undef(void)
+{
+ vec_t v = v;
+ return v;
+}