# endif
#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
(VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+ (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
+ (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+# define low_half(x) ({ /* evaluates to a half_t written by vextractf<w>x<n> with immediate 0 */ \
+ half_t t_; \
+ asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" /* mnemonic suffix is assembled from operands [w] and [n] */ \
+ : [d] "=m" (t_) /* result is stored through a memory operand */ \
+ : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); /* [w]: ELEM_SIZE * 8 (the 32/64 in the mnemonic), [n]: half the element count */ \
+ t_; \
+})
+# endif /* low_half() */
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
+ (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+# define low_quarter(x) ({ /* evaluates to a quarter_t written by vextractf<w>x<n> with immediate 0 */ \
+ quarter_t t_; \
+ asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" /* same template as low_half(), only [n] differs */ \
+ : [d] "=m" (t_) /* result is stored through a memory operand */ \
+ : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); /* [n]: a quarter of the element count */ \
+ t_; \
+})
+# endif /* low_quarter() */
# if FLOAT_SIZE == 4
# define broadcast(x) ({ \
vec_t t_; \
: "=v" (t_) : "m" (*(float[1]){ x }) ); \
t_; \
})
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__) /* only offered for 32-byte and larger vectors when AVX512DQ is enabled */
+# define broadcast_pair(x) ({ /* evaluates to a vec_t produced by vbroadcastf32x2 from x */ \
+ vec_t t_; \
+ asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); /* x is read as a memory operand; result goes to a "v" register operand */ \
+ t_; \
+})
+# endif /* broadcast_pair() */
+# if VEC_SIZE == 64 && defined(__AVX512DQ__) /* 8-element (f32x8) helpers exist only for 64-byte vectors with AVX512DQ */
+# define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0) /* B() is defined outside this excerpt; last argument is all-ones */
+# define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0) /* p is forwarded unchanged as the insert position argument */
+# endif /* broadcast_octet() / insert_octet() */
# define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
# define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
# define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
# define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
# define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
# else
+# define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0) /* B() is defined outside this excerpt; last argument is all-ones */
+# define insert_pair(x, y, p) /* built on the f32x4 insert: p >> 1 picks the 4-element slot, 3 << (p * 2) the two elements within the vector */ \
+ B(insertf32x4_, _mask, x, \
+ /* Cast needed below to work around gcc 7.x quirk. */ \
+ (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), /* odd p: y first goes through shufps with selector 0b01000100 */ \
+ (p) >> 1, x, 3 << ((p) * 2)) /* x is passed a second time, ahead of the 2-bit mask */
+# define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0) /* p is forwarded unchanged; all-ones final argument */
# define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
# define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
# define swap(x) ({ \
t_; \
})
# endif
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__) /* f64x2 helpers: 32-byte and larger vectors, AVX512DQ enabled */
+# define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0) /* B() is defined outside this excerpt; last argument is all-ones */
+# define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0) /* p is forwarded unchanged as the insert position argument */
+# endif /* broadcast_pair() / insert_pair() */
+# if VEC_SIZE == 64 /* f64x4 helpers: 64-byte vectors only, no AVX512DQ condition here */
+# define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0) /* second B() argument is empty here, unlike the _mask used next - NOTE(review): presumably mirrors the builtin's name; confirm */
+# define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0) /* p is forwarded unchanged as the insert position argument */
+# endif /* broadcast_quartet() / insert_quartet() */
# define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
# define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
# define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
t_; \
})
# endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
+ (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+# define low_quarter(x) ({ /* evaluates to a quarter_t written by vextracti<w>x<n> with immediate 0 */ \
+ quarter_t t_; \
+ asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" /* mnemonic suffix is assembled from operands [w] and [n] */ \
+ : [d] "=m" (t_) /* result is stored through a memory operand */ \
+ : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); /* [w]: ELEM_SIZE * 8, [n]: a quarter of the element count */ \
+ t_; \
+})
+# endif /* low_quarter() */
# if INT_SIZE == 4 || UINT_SIZE == 4
# define broadcast(x) ({ \
vec_t t_; \
asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
t_; \
})
+# ifdef __AVX512DQ__ /* no VEC_SIZE condition on this variant */
+# define broadcast_pair(x) ({ /* evaluates to a vec_t produced by vbroadcasti32x2 from x */ \
+ vec_t t_; \
+ asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); /* x is read as a memory operand; result goes to a "v" register operand */ \
+ t_; \
+})
+# endif /* broadcast_pair() */
+# if VEC_SIZE == 64 && defined(__AVX512DQ__) /* 8-element (i32x8) helpers exist only for 64-byte vectors with AVX512DQ */
+# define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0)) /* operands are cast to vsi_* types for B(), result cast back to vec_t */
+# define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0)) /* p is forwarded unchanged; same cast pattern */
+# endif /* broadcast_octet() / insert_octet() */
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
# else
+# define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0)) /* operand cast to vsi_quartet_t for B(), result cast back to vec_t */
+# define insert_pair(x, y, p) /* built on the i32x4 insert: p >> 1 picks the 4-element slot, 3 << (p * 2) the two elements within the vector */ \
+ (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
+ /* First cast needed below to work around gcc 7.x quirk. */ \
+ (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) /* odd p: y first goes through pshufd with selector 0b01000100 */ \
+ : (vsi_pair_t)(y), /* even p: y is used as is */ \
+ (p) >> 1, (vsi_t)(x), 3 << ((p) * 2))) /* x is passed a second time, ahead of the 2-bit mask */
+# define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0)) /* p is forwarded unchanged; all-ones final argument */
# define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
# define swap(x) ((vec_t)B(pshufd, _mask, \
t_; \
})
# endif
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__) /* i64x2 helpers: 32-byte and larger vectors, AVX512DQ enabled */
+# define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0)) /* operands are cast to vdi_* types for B(), result cast back to vec_t */
+# define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0)) /* p is forwarded unchanged; same cast pattern */
+# endif /* broadcast_pair() / insert_pair() */
+# if VEC_SIZE == 64 /* i64x4 helpers: 64-byte vectors only, no AVX512DQ condition here */
+# define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0)) /* second B() argument is empty here, unlike the _mask used next - NOTE(review): presumably mirrors the builtin's name; confirm */
+# define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0)) /* p is forwarded unchanged; same cast pattern as the neighbours */
+# endif /* broadcast_quartet() / insert_quartet() */
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
eighth_t y;
unsigned int i;
- for ( i = 0; i < ELEM_COUNT / 4; ++i )
+ for ( i = 0; i < ELEM_COUNT / 8; ++i )
y[i] = x[i];
return y;
#endif
+#ifdef broadcast_pair /* name the 2-element broadcast after the fraction of the vector it covers */
+# if ELEM_COUNT == 4
+# define broadcast_half broadcast_pair /* 2 of 4 elements */
+# elif ELEM_COUNT == 8
+# define broadcast_quarter broadcast_pair /* 2 of 8 elements */
+# elif ELEM_COUNT == 16
+# define broadcast_eighth broadcast_pair /* 2 of 16 elements */
+# endif
+#endif /* broadcast_pair */
+
+#ifdef insert_pair /* name the 2-element insert after the fraction of the vector it covers */
+# if ELEM_COUNT == 4
+# define insert_half insert_pair /* 2 of 4 elements */
+# elif ELEM_COUNT == 8
+# define insert_quarter insert_pair /* 2 of 8 elements */
+# elif ELEM_COUNT == 16
+# define insert_eighth insert_pair /* 2 of 16 elements */
+# endif
+#endif /* insert_pair */
+
+#ifdef broadcast_quartet /* name the 4-element broadcast after the fraction of the vector it covers */
+# if ELEM_COUNT == 8
+# define broadcast_half broadcast_quartet /* 4 of 8 elements */
+# elif ELEM_COUNT == 16
+# define broadcast_quarter broadcast_quartet /* 4 of 16 elements */
+# endif
+#endif /* broadcast_quartet */
+
+#ifdef insert_quartet /* name the 4-element insert after the fraction of the vector it covers */
+# if ELEM_COUNT == 8
+# define insert_half insert_quartet /* 4 of 8 elements */
+# elif ELEM_COUNT == 16
+# define insert_quarter insert_quartet /* 4 of 16 elements */
+# endif
+#endif /* insert_quartet */
+
+#if defined(broadcast_octet) && ELEM_COUNT == 16 /* 8 of 16 elements: the octet broadcast is the half broadcast */
+# define broadcast_half broadcast_octet
+#endif /* broadcast_octet */
+
+#if defined(insert_octet) && ELEM_COUNT == 16 /* 8 of 16 elements: the octet insert is the half insert */
+# define insert_half insert_octet
+#endif /* insert_octet */
+
#if defined(__AVX512F__) && defined(FLOAT_SIZE)
# include "simd-fma.c"
#endif
if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
#endif
+#if defined(broadcast_half) && defined(insert_half) /* run only when both half-vector helpers are available */
+ {
+ half_t aux = low_half(src); /* aux holds the result of low_half() on src */
+
+ touch(aux); /* touch() is defined outside this excerpt - presumably hides aux from the optimizer; confirm */
+ x = broadcast_half(aux); /* reference value: aux broadcast across the vector */
+ touch(aux);
+ y = insert_half(src, aux, 1); /* src with aux inserted at position 1; position 0 is left as in src */
+ if ( !eq(x, y) ) return __LINE__; /* mismatch: report the failing source line */
+ }
+#endif /* broadcast_half && insert_half */
+
+#if defined(broadcast_quarter) && defined(insert_quarter) /* run only when both quarter-vector helpers are available */
+ {
+ quarter_t aux = low_quarter(src); /* aux holds the result of low_quarter() on src */
+
+ touch(aux); /* touch() is defined outside this excerpt - presumably hides aux from the optimizer; confirm */
+ x = broadcast_quarter(aux); /* reference value: aux broadcast across the vector */
+ touch(aux);
+ y = insert_quarter(src, aux, 1); /* start from src; position 0 is left as in src */
+ touch(aux);
+ y = insert_quarter(y, aux, 2); /* keep accumulating into y */
+ touch(aux);
+ y = insert_quarter(y, aux, 3); /* positions 1..3 now all hold aux */
+ if ( !eq(x, y) ) return __LINE__; /* mismatch: report the failing source line */
+ }
+#endif /* broadcast_quarter && insert_quarter */
+
+#if defined(broadcast_eighth) && defined(insert_eighth) && \
+ /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
+ __GNUC__ >= 8
+ {
+ eighth_t aux = low_eighth(src); /* aux holds the result of low_eighth() on src */
+
+ touch(aux); /* touch() is defined outside this excerpt - presumably hides aux from the optimizer; confirm */
+ x = broadcast_eighth(aux); /* reference value: aux broadcast across the vector */
+ touch(aux);
+ y = insert_eighth(src, aux, 1); /* start from src; position 0 is left as in src */
+ touch(aux);
+ y = insert_eighth(y, aux, 2); /* keep accumulating into y */
+ touch(aux);
+ y = insert_eighth(y, aux, 3);
+ touch(aux);
+ y = insert_eighth(y, aux, 4);
+ touch(aux);
+ y = insert_eighth(y, aux, 5);
+ touch(aux);
+ y = insert_eighth(y, aux, 6);
+ touch(aux);
+ y = insert_eighth(y, aux, 7); /* positions 1..7 now all hold aux */
+ if ( !eq(x, y) ) return __LINE__; /* mismatch: report the failing source line */
+ }
+#endif /* broadcast_eighth && insert_eighth && __GNUC__ >= 8 */
+
#if defined(interleave_lo) && defined(interleave_hi)
touch(src);
x = interleave_lo(inv, src);
#include "avx512bw-opmask.h"
#include "avx512f.h"
#include "avx512bw.h"
+#include "avx512dq.h"
#define verbose false /* Switch to true for far more logging. */
}
#define simd_check_avx512dq_opmask simd_check_avx512dq
+static bool simd_check_avx512dq_vl(void) /* availability check for the combined AVX512DQ + AVX512VL case */
+{
+ return cpu_has_avx512dq && cpu_has_avx512vl; /* true only when both cpu_has_* conditions hold */
+}
+
static bool simd_check_avx512bw(void)
{
return cpu_has_avx512bw;
SIMD(XOP i32x8, xop, 32i4),
SIMD(XOP i64x4, xop, 32i8),
SIMD(OPMASK/w, avx512f_opmask, 2),
- SIMD(OPMASK/b, avx512dq_opmask, 1),
- SIMD(OPMASK/d, avx512bw_opmask, 4),
- SIMD(OPMASK/q, avx512bw_opmask, 8),
+ SIMD(OPMASK+DQ/b, avx512dq_opmask, 1),
+ SIMD(OPMASK+DQ/w, avx512dq_opmask, 2),
+ SIMD(OPMASK+BW/d, avx512bw_opmask, 4),
+ SIMD(OPMASK+BW/q, avx512bw_opmask, 8),
SIMD(AVX512F f32 scalar, avx512f, f4),
SIMD(AVX512F f32x16, avx512f, 64f4),
SIMD(AVX512F f64 scalar, avx512f, f8),
AVX512VL(BW+VL u16x8, avx512bw, 16u2),
AVX512VL(BW+VL s16x16, avx512bw, 32i2),
AVX512VL(BW+VL u16x16, avx512bw, 32u2),
+ SIMD(AVX512DQ f32x16, avx512dq, 64f4),
+ SIMD(AVX512DQ f64x8, avx512dq, 64f8),
+ SIMD(AVX512DQ s32x16, avx512dq, 64i4),
+ SIMD(AVX512DQ u32x16, avx512dq, 64u4),
+ SIMD(AVX512DQ s64x8, avx512dq, 64i8),
+ SIMD(AVX512DQ u64x8, avx512dq, 64u8),
+ AVX512VL(DQ+VL f32x4, avx512dq, 16f4),
+ AVX512VL(DQ+VL f64x2, avx512dq, 16f8),
+ AVX512VL(DQ+VL f32x8, avx512dq, 32f4),
+ AVX512VL(DQ+VL f64x4, avx512dq, 32f8),
+ AVX512VL(DQ+VL s32x4, avx512dq, 16i4),
+ AVX512VL(DQ+VL u32x4, avx512dq, 16u4),
+ AVX512VL(DQ+VL s32x8, avx512dq, 32i4),
+ AVX512VL(DQ+VL u32x8, avx512dq, 32u4),
+ AVX512VL(DQ+VL s64x2, avx512dq, 16i8),
+ AVX512VL(DQ+VL u64x2, avx512dq, 16u8),
+ AVX512VL(DQ+VL s64x4, avx512dq, 32i8),
+ AVX512VL(DQ+VL u64x4, avx512dq, 32u8),
#undef AVX512VL_
#undef AVX512VL
#undef SIMD_