Implementations based off headers in LLVM 15.0.0.
Signed-off-by: Andrei Tatar <andrei@unikraft.io>
Reviewed-by: Maria Sfiraiala <maria.sfiraiala@gmail.com>
Reviewed-by: Radu Nichita <radunichita99@gmail.com>
Approved-by: Razvan Deaconescu <razvand@unikraft.io>
Tested-by: Unikraft CI <monkey@unikraft.io>
GitHub-Closes: #3
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi8(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
+#else
return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+#endif
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi16(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
+#else
return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+#endif
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu8(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
+#else
return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+#endif
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu16(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
+#else
return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+#endif
}
#define _mm256_alignr_epi8(a, b, n) \
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi8(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
+#else
return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+#endif
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi16(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
+#else
return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+#endif
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu8(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
+#else
return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+#endif
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu16(__m256i __a, __m256i __b)
{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
+#else
return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+#endif
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epi8 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
+#else
return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epi16 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
+#else
return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epu8 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
+#else
return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epu16 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
+#else
return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epi8 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
+#else
return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epi16 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
+#else
return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epu8 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
+#else
return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epu16 (__m512i __A, __m512i __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
+#else
return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi16(__m512i __A, unsigned int __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B);
+#else
return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi16(__m512i __A, unsigned int __B)
{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B);
+#else
return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+#endif
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
*/
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v8di)__W);
+#else
return __builtin_ia32_reduce_add_q512(__W);
+#endif
}
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v8di)__W);
+#else
return __builtin_ia32_reduce_mul_q512(__W);
+#endif
}
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
__W = _mm512_maskz_mov_epi64(__M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v8di)__W);
+#else
return __builtin_ia32_reduce_add_q512(__W);
+#endif
}
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v8di)__W);
+#else
return __builtin_ia32_reduce_mul_q512(__W);
+#endif
}
static __inline__ long long __DEFAULT_FN_ATTRS512
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_add_epi32(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v16si)__W);
+#else
return __builtin_ia32_reduce_add_d512((__v16si)__W);
+#endif
}
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_epi32(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v16si)__W);
+#else
return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+#endif
}
static __inline__ int __DEFAULT_FN_ATTRS512
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
__W = _mm512_maskz_mov_epi32(__M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v16si)__W);
+#else
return __builtin_ia32_reduce_add_d512((__v16si)__W);
+#endif
}
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v16si)__W);
+#else
return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+#endif
}
static __inline__ int __DEFAULT_FN_ATTRS512
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
+#else
return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+#endif
}
/// Adds, with saturation, the corresponding elements of two 128-bit
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
+#else
return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+#endif
}
/// Adds, with saturation, the corresponding elements of two 128-bit
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
+#else
return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+#endif
}
/// Adds, with saturation, the corresponding elements of two 128-bit
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
+#else
return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+#endif
}
/// Computes the rounded averages of corresponding elements of two
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
+#else
return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+#endif
}
/// Subtracts corresponding 16-bit signed integer values in the input and
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
+#else
return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+#endif
}
/// Subtracts corresponding 8-bit unsigned integer values in the input
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
+#else
return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+#endif
}
/// Subtracts corresponding 16-bit unsigned integer values in the input
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a, __m128i __b)
{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
+#else
return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+#endif
}
/// Performs a bitwise AND of two 128-bit integer vectors.