Implementations are based on the headers in GCC release 12.3.0.
Signed-off-by: Andrei Tatar <andrei@unikraft.io>
Reviewed-by: Maria Sfiraiala <maria.sfiraiala@gmail.com>
Reviewed-by: Radu Nichita <radunichita99@gmail.com>
Approved-by: Razvan Deaconescu <razvand@unikraft.io>
Tested-by: Unikraft CI <monkey@unikraft.io>
GitHub-Closes: #3
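The recurring pattern throughout the patch is a __GNUC__ version gate: the GCC 13 spelling of a builtin, literal, or header stays on the >= 13 branch, and the GCC 12.3.0 equivalent goes on the < 13 branch. A minimal sketch of the idea (the macro name is hypothetical, for illustration only):

    #if (__GNUC__ < 13)
    /* GCC 12 names the bf16 conversion builtins after their integer
       carrier vector type (..._v32hi)... */
    # define __CVTNE2PS2BF16_512 __builtin_ia32_cvtne2ps2bf16_v32hi
    #else
    /* ...while GCC 13 names them after the new __bf16 element type. */
    # define __CVTNE2PS2BF16_512 __builtin_ia32_cvtne2ps2bf16_v32bf
    #endif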
#define __DISABLE_AVX512BF16__
#endif /* __AVX512BF16__ */
+#if (__GNUC__ < 13)
+/* Internal data types for implementing the intrinsics. */
+typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* Convert One BF16 Data to One Single Float Data. */
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsbh_ss (__bfloat16 __A)
+{
+ union{ float __a; unsigned int __b;} __tmp;
+ __tmp.__b = ((unsigned int)(__A)) << 16;
+ return __tmp.__a;
+}
+#else
/* Internal data types for implementing the intrinsics. */
typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsbh_ss (__bfloat16 __A)
{
return __builtin_ia32_cvtbf2sf (__A);
}
+#endif
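The GCC 12 fallback for _mm_cvtsbh_ss above exploits the fact that bfloat16 is the upper 16 bits of the corresponding IEEE-754 binary32 value, so widening is just a shift. A standalone demonstration (not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned short bh = 0x3F80;                /* bf16 bit pattern of 1.0 */
        unsigned int bits = (unsigned int)bh << 16;
        float f;
        memcpy(&f, &bits, sizeof f);               /* well-defined type pun */
        printf("%f\n", f);                         /* prints 1.000000 */
        return 0;
    }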
/* vcvtne2ps2bf16 */
extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
{
+#if (__GNUC__ < 13)
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+#else
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
+#endif
}
extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
{
+#if (__GNUC__ < 13)
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+#else
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
+#endif
}
extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
{
+#if (__GNUC__ < 13)
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+#else
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
+#endif
}
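Both branches of each function above emit the same vcvtne2ps2bf16 instruction; only the builtin's name differs between compiler versions. A usage sketch, assuming a compiler and target with AVX512BF16 enabled (e.g. -mavx512bf16):

    #include <immintrin.h>

    /* Pack two 16-float vectors into one 32-element bf16 vector;
       lo fills result elements 0..15 and hi fills elements 16..31. */
    __m512bh pack32_bf16(__m512 hi, __m512 lo)
    {
        return _mm512_cvtne2ps_pbh(hi, lo);
    }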
/* vcvtneps2bf16 */
#define __DISABLE_AVX512BF16VL__
#endif /* __AVX512BF16VL__ */
+#if (__GNUC__ < 13)
+/* Internal data types for implementing the intrinsics. */
+typedef short __v16bh __attribute__ ((__vector_size__ (32)));
+typedef short __v8bh __attribute__ ((__vector_size__ (16)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+typedef unsigned short __bfloat16;
+#else
/* Internal data types for implementing the intrinsics. */
typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));

typedef __bf16 __bfloat16;
+#endif
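Whichever branch is selected, every vector type keeps the same 16-bit-per-lane layout, so object size and calling convention do not change between the two definitions. An illustrative compile-time check (not part of the patch):

    _Static_assert(sizeof(__m256bh) == 32, "16 bf16 lanes of 16 bits each");
    _Static_assert(sizeof(__m128bh) == 16, "8 bf16 lanes of 16 bits each");
    _Static_assert(sizeof(__bfloat16) == 2, "scalar bf16 carrier is 16 bits");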
#define _mm256_cvtneps_pbh(A) \
(__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A)
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
{
+#if (__GNUC__ < 13)
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
+#else
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
+#endif
}
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
{
+#if (__GNUC__ < 13)
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
+#else
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
+#endif
}
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
{
+#if (__GNUC__ < 13)
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
+#else
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
+#endif
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
{
+#if (__GNUC__ < 13)
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
+#else
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
+#endif
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
{
+#if (__GNUC__ < 13)
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
+#else
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
+#endif
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
{
+#if (__GNUC__ < 13)
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
+#else
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
+#endif
}
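In the masked forms the writemask selects which bf16 result lanes are kept; the maskz variants zero the unselected lanes instead of merging. A usage sketch, assuming AVX512BF16 and AVX512VL (e.g. -mavx512bf16 -mavx512vl):

    #include <immintrin.h>

    /* Lanes 0..3 of the result hold bf16(b); lanes 4..7, which would
       hold bf16(a), are zeroed by the 0x0F writemask. */
    __m128bh low_four_bf16(__m128 a, __m128 b)
    {
        return _mm_maskz_cvtne2ps_pbh((__mmask8)0x0F, a, b);
    }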
/* vcvtneps2bf16 */
return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
}
-extern __inline __bf16
+extern __inline __bfloat16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtness_sbh (float __A)
{
__v4sf __V = {__A, 0, 0, 0};
+#if (__GNUC__ < 13)
+ __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+ (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
+#else
__v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
(__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
+#endif
return __R[0];
}
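Together with _mm_cvtsbh_ss this gives a scalar narrow/widen pair. A round-trip sketch (assuming AVX512BF16; the narrowing rounds to nearest even, so the low 16 mantissa bits are lost):

    /* Narrow x to bf16, then widen it back to float. */
    float bf16_roundtrip(float x)
    {
        return _mm_cvtsbh_ss(_mm_cvtness_sbh(x));
    }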
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ph (void)
{
+#if (__GNUC__ < 13)
+ return _mm_set1_ph (0.0f);
+#else
return _mm_set1_ph (0.0f16);
+#endif
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ph (void)
{
+#if (__GNUC__ < 13)
+ return _mm256_set1_ph (0.0f);
+#else
return _mm256_set1_ph (0.0f16);
+#endif
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_setzero_ph (void)
{
+#if (__GNUC__ < 13)
+ return _mm512_set1_ph (0.0f);
+#else
return _mm512_set1_ph (0.0f16);
+#endif
}
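The only difference between the branches above is the literal: the f16 suffix for _Float16 constants is new in GCC 13, while under GCC 12 a plain float literal converts implicitly, and exactly so for values like 0.0 and 1.0 that _Float16 represents. For example:

    _Float16 a = 1.0f;     /* works on GCC 12 and 13: implicit conversion */
    #if (__GNUC__ >= 13)
    _Float16 b = 1.0f16;   /* GCC 13 only: _Float16 literal suffix */
    #endif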
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sh (_Float16 __F)
{
+#if (__GNUC__ < 13)
+ return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, __F);
+#else
return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16,
__F);
+#endif
}
/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sh (void const *__P)
{
+#if (__GNUC__ < 13)
+ return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ *(_Float16 const *) __P);
+#else
return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16,
*(_Float16 const *) __P);
+#endif
}
extern __inline __m512h
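A usage sketch for the scalar set/load path, assuming AVX512FP16 support (e.g. -mavx512fp16):

    #include <immintrin.h>

    /* Load one half-precision scalar into lane 0; lanes 1..7 are zero. */
    __m128h load_scalar_half(const _Float16 *p)
    {
        return _mm_load_sh(p);
    }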
#include <avxvnniintrin.h>
+#if (__GNUC__ >= 13)
#include <avxifmaintrin.h>
#include <avxvnniint8intrin.h>
+#endif
#include <avx2intrin.h>
#include <avx512bf16intrin.h>
+#if (__GNUC__ >= 13)
#include <avxneconvertintrin.h>
#endif
+#endif
#include <amxtileintrin.h>
#include <amxbf16intrin.h>
+#if (__GNUC__ >= 13)
#include <amxcomplexintrin.h>
+#endif
#include <prfchwintrin.h>
#include <keylockerintrin.h>
+#if (__GNUC__ >= 13)
#include <amxfp16intrin.h>
+#endif
#endif /* _IMMINTRIN_H_INCLUDED */
#include <clzerointrin.h>
+#if (__GNUC__ >= 13)
#include <cmpccxaddintrin.h>
+#endif
#include <enqcmdintrin.h>
#include <pkuintrin.h>
+#if (__GNUC__ >= 13)
#include <prfchiintrin.h>
#include <raointintrin.h>
+#endif
#include <rdseedintrin.h>
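Headers that first shipped with GCC 13 (AVX-IFMA, AVX-VNNI-INT8, AVX-NE-CONVERT, AMX-COMPLEX, AMX-FP16, CMPccXADD, PREFETCHI, RAO-INT) are simply compiled out under GCC 12. Consumers needing one of the gated families can mirror the same test, for example:

    #if defined(__GNUC__) && (__GNUC__ >= 13) && defined(__AVXIFMA__)
    /* AVX-IFMA intrinsics from <avxifmaintrin.h> are usable here. */
    #else
    /* Portable fallback path. */
    #endif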
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
+#if (__GNUC__ < 13)
+ __builtin_prefetch (__P, (__I & 0x4) >> 2, __I & 0x3);
+#else
__builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
__I & 0x3, (__I & 0x10) >> 4);
+#endif
}
#else
+#if (__GNUC__ < 13)
+#define _mm_prefetch(P, I) \
+ __builtin_prefetch ((P), ((I & 0x4) >> 2), (I & 0x3))
+#else
#define _mm_prefetch(P, I) \
__builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
#endif
+#endif
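Both branches encode the hint identically in the low bits (bit 2 selects write intent, bits 0..1 the temporal locality); GCC 13's __builtin_ia32_prefetch adds a fourth argument for the PREFETCHI instruction-prefetch bit, which GCC 12's __builtin_prefetch cannot express. A usage sketch:

    #include <xmmintrin.h>

    /* Prefetch one cache line for reading into all cache levels:
       _MM_HINT_T0 (== 3) gives write=0, locality=3 on either branch. */
    void warm(const void *p)
    {
        _mm_prefetch(p, _MM_HINT_T0);
    }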
#ifndef __SSE__
#pragma GCC push_options