sse2-vecs := $(sse-vecs)
sse2-ints := 1 2 4 8
sse2-flts := 4 8
+sse4-vecs := $(sse2-vecs)
+sse4-ints := $(sse2-ints)
+sse4-flts := $(sse2-flts)
# When converting SSE to AVX, have the compiler avoid XMM0 to widen
-# coverage of the VEX.vvvv checks in the emulator.
-sse2avx := -ffixed-xmm0 -Wa,-msse2avx
-
-simd-cflags := $(foreach flavor,sse sse2, \
+# coverage of the VEX.vvvv checks in the emulator. We must not do this,
+# however, for SSE4.1 and later, as there are instructions with XMM0 as
+# an implicit operand.
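+# (BLENDVPS, BLENDVPD, and PBLENDVB, for example, take their mask in XMM0.)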
+sse2avx-sse := -ffixed-xmm0 -Wa,-msse2avx
+sse2avx-sse2 := $(sse2avx-sse)
+sse2avx-sse4 := -Wa,-msse2avx
+
+simd-cflags := $(foreach flavor,sse sse2 sse4, \
$(foreach vec,$($(flavor)-vecs), \
$(foreach int,$($(flavor)-ints), \
"-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
"-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)" \
- "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
- "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+ "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx-$(flavor)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+ "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx-$(flavor)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
$(foreach flt,$($(flavor)-flts), \
"-D$(flavor)_$(vec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)" \
- "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+ "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx-$(flavor)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
$(foreach flt,$($(flavor)-flts), \
"-D$(flavor)_f$(flt) -m$(flavor) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)" \
- "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=sse $(sse2avx) -O2 -DFLOAT_SIZE=$(flt)"))
+ "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=sse $(sse2avx-$(flavor)) -O2 -DFLOAT_SIZE=$(flt)"))
$(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
rm -f $@.new $*.bin
#if VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
-# if defined(__SSE__) && ELEM_SIZE == 4
+# if defined(__SSE4_1__)
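+/*
+ * PTEST sets CF when ~op1 & op2 is all zero; with an all-ones second operand
+ * ((vdi_t){} == 0) this checks that every bit of cmp is set.
+ */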
+# define to_bool(cmp) __builtin_ia32_ptestc128((vdi_t)(cmp), (vdi_t){} == 0)
+# elif defined(__SSE__) && ELEM_SIZE == 4
# define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
# elif defined(__SSE2__)
# if ELEM_SIZE == 8
__builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
})
#endif
+#if VEC_SIZE == 16 && defined(__SSE3__)
+# if FLOAT_SIZE == 4
+# define addsub(x, y) __builtin_ia32_addsubps(x, y)
+# define dup_hi(x) __builtin_ia32_movshdup(x)
+# define dup_lo(x) __builtin_ia32_movsldup(x)
+# define hadd(x, y) __builtin_ia32_haddps(x, y)
+# define hsub(x, y) __builtin_ia32_hsubps(x, y)
+# elif FLOAT_SIZE == 8
+# define addsub(x, y) __builtin_ia32_addsubpd(x, y)
+# define dup_lo(x) ({ \
+ double __attribute__((vector_size(16))) r_; \
+ asm ( "movddup %1,%0" : "=x" (r_) : "m" ((x)[0]) ); \
+ r_; \
+})
+# define hadd(x, y) __builtin_ia32_haddpd(x, y)
+# define hsub(x, y) __builtin_ia32_hsubpd(x, y)
+# endif
+#endif
+#if VEC_SIZE == 16 && defined(__SSSE3__)
+# if INT_SIZE == 1
+# define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
+# elif INT_SIZE == 2
+# define abs(x) __builtin_ia32_pabsw128(x)
+# elif INT_SIZE == 4
+# define abs(x) __builtin_ia32_pabsd128(x)
+# endif
+# if INT_SIZE == 1 || UINT_SIZE == 1
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignb128((vqi_t)(x), (vqi_t)(y)))
+# define swap(x) ((vec_t)__builtin_ia32_pshufb128((vqi_t)(x), (vqi_t)(inv - 1)))
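+/*
+ * The shift operand of __builtin_ia32_palignr128() is a bit count, hence the
+ * multiplication by the element width in bits.
+ */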
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 8))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignw128((vhi_t)(x), (vhi_t)(y)))
+# define hadd(x, y) ((vec_t)__builtin_ia32_phaddw128((vhi_t)(x), (vhi_t)(y)))
+# define hsub(x, y) ((vec_t)__builtin_ia32_phsubw128((vhi_t)(x), (vhi_t)(y)))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 16))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignd128((vsi_t)(x), (vsi_t)(y)))
+# define hadd(x, y) ((vec_t)__builtin_ia32_phaddd128((vsi_t)(x), (vsi_t)(y)))
+# define hsub(x, y) ((vec_t)__builtin_ia32_phsubd128((vsi_t)(x), (vsi_t)(y)))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 32))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
+# endif
+#endif
+#if VEC_SIZE == 16 && defined(__SSE4_1__)
+# if INT_SIZE == 1
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw128((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd128((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq128((vqi_t)(x)))
+# elif INT_SIZE == 2
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd128(x))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq128(x))
+# elif INT_SIZE == 4
+# define max(x, y) __builtin_ia32_pmaxsd128(x, y)
+# define min(x, y) __builtin_ia32_pminsd128(x, y)
+# define mul_full(x, y) ((vec_t)__builtin_ia32_pmuldq128(x, y))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq128(x))
+# elif UINT_SIZE == 1
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw128((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd128((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq128((vqi_t)(x)))
+# elif UINT_SIZE == 2
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxuw128((vhi_t)(x), (vhi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminuw128((vhi_t)(x), (vhi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd128((vhi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq128((vhi_t)(x)))
+# elif UINT_SIZE == 4
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxud128((vsi_t)(x), (vsi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminud128((vsi_t)(x), (vsi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq128((vsi_t)(x)))
+# endif
+# undef select
+# if defined(INT_SIZE) || defined(UINT_SIZE)
+# define select(d, x, y, m) \
+ (*(d) = (vec_t)__builtin_ia32_pblendvb128((vqi_t)(y), (vqi_t)(x), (vqi_t)(m)))
+# elif FLOAT_SIZE == 4
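+/*
+ * The DPPS/DPPD immediate's high nibble selects the elements entering the
+ * product; the low nibble selects where the sum is stored (element 0 only).
+ */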
+# define dot_product(x, y) __builtin_ia32_dpps(x, y, 0b11110001)
+# define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps(y, x, m))
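+/* 0b1011: round toward zero (truncate), inexact exception suppressed. */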
+# define trunc(x) __builtin_ia32_roundps(x, 0b1011)
+# elif FLOAT_SIZE == 8
+# define dot_product(x, y) __builtin_ia32_dppd(x, y, 0b00110001)
+# define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd(y, x, m))
+# define trunc(x) __builtin_ia32_roundpd(x, 0b1011)
+# endif
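+/*
+ * mix() takes the even numbered elements from the first operand and the odd
+ * numbered ones from the second. PBLENDW acts on 16-bit lanes, hence the
+ * wider masks for 32- and 64-bit element sizes.
+ */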
+# if INT_SIZE == 2 || UINT_SIZE == 2
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b10101010))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11001100))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11110000))
+# elif FLOAT_SIZE == 4
+# define mix(x, y) __builtin_ia32_blendps(x, y, 0b1010)
+# elif FLOAT_SIZE == 8
+# define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
+# endif
+#endif
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
+# ifdef __SSE4_1__
+# if FLOAT_SIZE == 4
+# define trunc(x) ({ \
+ float __attribute__((vector_size(16))) r_; \
+ asm ( "roundss $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
+ (vec_t){ r_[0] }; \
+})
+# elif FLOAT_SIZE == 8
+# define trunc(x) ({ \
+ double __attribute__((vector_size(16))) r_; \
+ asm ( "roundsd $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
+ (vec_t){ r_[0] }; \
+})
+# endif
+# endif
#endif
/*
if ( !to_bool(sqrt(x) == src) ) return __LINE__;
# endif
+# ifdef trunc
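+    /* src is 1, 2, ..., so 1 / src truncates to (1, 0, ..., 0). */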
+ x = 1 / src;
+ y = (vec_t){ 1 };
+ touch(x);
+ z = trunc(x);
+ if ( !to_bool(y == z) ) return __LINE__;
+# endif
+
#else
# if ELEM_SIZE > 1
# endif
#endif
+#ifdef abs
+ x = src * alt;
+ touch(x);
+ if ( !to_bool(abs(x) == src) ) return __LINE__;
+#endif
+
+#ifdef copysignz
+ touch(alt);
+ if ( !to_bool(copysignz((vec_t){} + 1, alt) == alt) ) return __LINE__;
+#endif
+
#ifdef swap
touch(src);
if ( !to_bool(swap(src) == inv) ) return __LINE__;
if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
#endif
+#if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)
+
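+    /*
+     * widen<n>() sign-extends the low elements of x; interleaving with the
+     * sign mask (alt < 0) yields the expected (little endian) layout.
+     */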
+ x = src * alt;
+ y = interleave_lo(x, alt < 0);
+ touch(x);
+ z = widen1(x);
+ touch(x);
+ if ( !to_bool(z == y) ) return __LINE__;
+
+# ifdef widen2
+ y = interleave_lo(alt < 0, alt < 0);
+ y = interleave_lo(z, y);
+ touch(x);
+ z = widen2(x);
+ touch(x);
+ if ( !to_bool(z == y) ) return __LINE__;
+
+# ifdef widen3
+ y = interleave_lo(alt < 0, alt < 0);
+ y = interleave_lo(y, y);
+ y = interleave_lo(z, y);
+ touch(x);
+ z = widen3(x);
+ touch(x);
+ if ( !to_bool(z == y) ) return __LINE__;
+# endif
+# endif
+
+#endif
+
+#if defined(UINT_SIZE) && defined(interleave_lo)
+
+ y = interleave_lo(src, (vec_t){});
+ z = interleave_lo(y, (vec_t){});
+
+# ifdef widen1
+ touch(src);
+ x = widen1(src);
+ touch(src);
+ if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# ifdef widen2
+ touch(src);
+ x = widen2(src);
+ touch(src);
+ if ( !to_bool(x == z) ) return __LINE__;
+# endif
+
+# ifdef widen3
+ touch(src);
+ x = widen3(src);
+ touch(src);
+ if ( !to_bool(x == interleave_lo(z, (vec_t){})) ) return __LINE__;
+# endif
+
+#endif
+
+#ifdef dup_lo
+ touch(src);
+ x = dup_lo(src);
+ touch(src);
+ if ( !to_bool(x - src == (alt - 1) / 2) ) return __LINE__;
+#endif
+
+#ifdef dup_hi
+ touch(src);
+ x = dup_hi(src);
+ touch(src);
+ if ( !to_bool(x - src == (alt + 1) / 2) ) return __LINE__;
+#endif
+
+ for ( i = 0; i < ELEM_COUNT; ++i )
+ y[i] = (i & 1 ? inv : src)[i];
+
#ifdef select
# ifdef UINT_SIZE
select(&z, src, inv, alt);
# else
select(&z, src, inv, alt > 0);
# endif
- for ( i = 0; i < ELEM_COUNT; ++i )
- y[i] = (i & 1 ? inv : src)[i];
if ( !to_bool(z == y) ) return __LINE__;
#endif
+#ifdef mix
+ touch(src);
+ touch(inv);
+ x = mix(src, inv);
+ if ( !to_bool(x == y) ) return __LINE__;
+
+# ifdef addsub
+ touch(src);
+ touch(inv);
+ x = addsub(src, inv);
+ touch(src);
+ touch(inv);
+ y = mix(src - inv, src + inv);
+ if ( !to_bool(x == y) ) return __LINE__;
+# endif
+#endif
+
+#ifdef rotr
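+    /*
+     * rotr(src, 1) is (2, 3, ..., ELEM_COUNT, 1), which for src = 1 ... N and
+     * a power-of-two N equals (src & (ELEM_COUNT - 1)) + 1.
+     */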
+ x = rotr(src, 1);
+ y = (src & (ELEM_COUNT - 1)) + 1;
+ if ( !to_bool(x == y) ) return __LINE__;
+#endif
+
+#ifdef dot_product
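+    /*
+     * src . inv = sum of i * (ELEM_COUNT + 1 - i) over i = 1 ... ELEM_COUNT,
+     * i.e. ELEM_COUNT * (ELEM_COUNT + 1) * (ELEM_COUNT + 2) / 6.
+     */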
+ touch(src);
+ touch(inv);
+ x = dot_product(src, inv);
+ if ( !to_bool(x == (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
+ (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
+#endif
+
+#ifdef hadd
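+    /*
+     * hadd((vec_t){}, x) moves the pairwise sums of x into the upper half of
+     * the result, so log2(ELEM_COUNT) passes accumulate the full sum
+     * 1 + ... + ELEM_COUNT in the last element.
+     */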
+ x = src;
+ for ( i = ELEM_COUNT; i >>= 1; )
+ {
+ touch(x);
+ x = hadd((vec_t){}, x);
+ }
+ if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+
+# ifdef hsub
+ touch(src);
+ touch(inv);
+ x = hsub(src, inv);
+ for ( i = ELEM_COUNT; i >>= 1; )
+ x = hadd(x, (vec_t){});
+ if ( !to_bool(x == 0) ) return __LINE__;
+# endif
+#endif
+
return 0;
}
return cpu_has_sse2;
}
+static bool simd_check_sse4(void)
+{
+ return cpu_has_sse4_2;
+}
+
static bool simd_check_avx(void)
{
return cpu_has_avx;
}
#define simd_check_sse_avx simd_check_avx
#define simd_check_sse2_avx simd_check_avx
+#define simd_check_sse4_avx simd_check_avx
static void simd_set_regs(struct cpu_user_regs *regs)
{
SIMD(SSE2 packed u32, sse2, 16u4),
SIMD(SSE2 packed s64, sse2, 16i8),
SIMD(SSE2 packed u64, sse2, 16u8),
+ SIMD(SSE4 scalar single, sse4, f4),
+ SIMD(SSE4 packed single, sse4, 16f4),
+ SIMD(SSE4 scalar double, sse4, f8),
+ SIMD(SSE4 packed double, sse4, 16f8),
+ SIMD(SSE4 packed s8, sse4, 16i1),
+ SIMD(SSE4 packed u8, sse4, 16u1),
+ SIMD(SSE4 packed s16, sse4, 16i2),
+ SIMD(SSE4 packed u16, sse4, 16u2),
+ SIMD(SSE4 packed s32, sse4, 16i4),
+ SIMD(SSE4 packed u32, sse4, 16u4),
+ SIMD(SSE4 packed s64, sse4, 16i8),
+ SIMD(SSE4 packed u64, sse4, 16u8),
SIMD(SSE/AVX scalar single, sse_avx, f4),
SIMD(SSE/AVX packed single, sse_avx, 16f4),
SIMD(SSE2/AVX scalar single, sse2_avx, f4),
SIMD(SSE2/AVX packed u32, sse2_avx, 16u4),
SIMD(SSE2/AVX packed s64, sse2_avx, 16i8),
SIMD(SSE2/AVX packed u64, sse2_avx, 16u8),
+ SIMD(SSE4/AVX scalar single, sse4_avx, f4),
+ SIMD(SSE4/AVX packed single, sse4_avx, 16f4),
+ SIMD(SSE4/AVX scalar double, sse4_avx, f8),
+ SIMD(SSE4/AVX packed double, sse4_avx, 16f8),
+ SIMD(SSE4/AVX packed s8, sse4_avx, 16i1),
+ SIMD(SSE4/AVX packed u8, sse4_avx, 16u1),
+ SIMD(SSE4/AVX packed s16, sse4_avx, 16i2),
+ SIMD(SSE4/AVX packed u16, sse4_avx, 16u2),
+ SIMD(SSE4/AVX packed s32, sse4_avx, 16i4),
+ SIMD(SSE4/AVX packed u32, sse4_avx, 16u4),
+ SIMD(SSE4/AVX packed s64, sse4_avx, 16i8),
+ SIMD(SSE4/AVX packed u64, sse4_avx, 16u8),
#undef SIMD_
#undef SIMD
};
else
printf("skipped\n");
+ printf("%-40s", "Testing extrq $4,$56,%xmm2...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(extrq_imm);
+
+ res[0] = 0x44332211;
+ res[1] = 0x88776655;
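+        /*
+         * Extracting the 56-bit field at bit 4 of 0x8877665544332211 yields
+         * 0x0087766554433221.
+         */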
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(extrq_imm, "extrq $4, $56, %%xmm2")
+ :: "m" (res[0]) : "memory" );
+
+ set_insn(extrq_imm);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm2, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(extrq_imm) ||
+ res[4] != 0x54433221 || res[5] != 0x877665 )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing extrq %xmm3,%xmm2...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(extrq_reg);
+
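+        /*
+         * %xmm3 supplies the field length (56) in bits 5:0 and the starting
+         * bit index (4) in bits 13:8.
+         */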
+ res[4] = 56 + (4 << 8);
+ res[5] = 0;
+ asm volatile ( "movq %0, %%xmm2\n"
+ "movq %1, %%xmm3\n"
+ put_insn(extrq_reg, "extrq %%xmm3, %%xmm2")
+ :: "m" (res[0]), "m" (res[4]) : "memory" );
+
+ set_insn(extrq_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm2, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(extrq_reg) ||
+ res[4] != 0x54433221 || res[5] != 0x877665 )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing insertq $12,$40,%xmm2,%xmm3...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(insertq_imm);
+
+ res[4] = 0xccbbaa99;
+ res[5] = 0x00ffeedd;
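+        /*
+         * Inserting the low 40 bits of 0x00ffeeddccbbaa99 at bit 12 of
+         * 0x8877665544332211 yields 0x887ddccbbaa99211.
+         */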
+ asm volatile ( "movq %1, %%xmm2\n"
+ "movq %0, %%xmm3\n"
+ put_insn(insertq_imm, "insertq $12, $40, %%xmm2, %%xmm3")
+ :: "m" (res[0]), "m" (res[4]) : "memory" );
+
+ set_insn(insertq_imm);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm3, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(insertq_imm) ||
+ res[4] != 0xbaa99211 || res[5] != 0x887ddccb )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing insertq %xmm2,%xmm3...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(insertq_reg);
+
+ res[4] = 0xccbbaa99;
+ res[5] = 0x00ffeedd;
+ res[6] = 40 + (12 << 8);
+ res[7] = 0;
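+        /*
+         * The field length (40) and bit index (12) come from bits 69:64 and
+         * 77:72 of %xmm2, i.e. the low bits of its second qword.
+         */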
+ asm volatile ( "movdqu %1, %%xmm2\n"
+ "movq %0, %%xmm3\n"
+ put_insn(insertq_reg, "insertq %%xmm2, %%xmm3")
+ :: "m" (res[0]), "m" (res[4]) : "memory" );
+
+ set_insn(insertq_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm3, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(insertq_reg) ||
+ res[4] != 0xbaa99211 || res[5] != 0x887ddccb )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing stmxcsr (%edx)...");
if ( cpu_has_sse )
{