From: Andrei Tatar
Date: Mon, 17 Jul 2023 13:39:32 +0000 (+0200)
Subject: Remove GCC compatibility adaptations
X-Git-Tag: RELEASE-0.14.0~13
X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=58bda76e9d89649e9f2b807a396a09be5a062196;p=unikraft%2Flibs%2Fintel-intrinsics.git

Remove GCC compatibility adaptations

This change removes the modifications previously made for GCC
compatibility, reverting the headers to the versions released with
LLVM 7.0.1. This is done in anticipation of an update of the upstream
code, as well as a rework of compiler compatibility.

Signed-off-by: Andrei Tatar
Reviewed-by: Maria Sfiraiala
Reviewed-by: Radu Nichita
Approved-by: Razvan Deaconescu
Tested-by: Unikraft CI
GitHub-Closes: #3
---

diff --git a/include/avxintrin.h b/include/avxintrin.h
index 9ab6de7..cb15396 100644
--- a/include/avxintrin.h
+++ b/include/avxintrin.h
@@ -50,13 +50,8 @@ typedef double __m256d __attribute__((__vector_size__(32)));
 typedef long long __m256i __attribute__((__vector_size__(32)));
 
 /* Define the default attributes for the functions in this file. */
-#ifdef __GNUC__
-#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-#else
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
-#endif
 
 /* Arithmetic */
 /// Adds two 256-bit vectors of [4 x double].
@@ -2035,17 +2030,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// element is extracted and returned.
 /// \returns A 64-bit integer containing the extracted 64 bits of extended
 /// packed data.
-#ifdef __clang__
 #define _mm256_extract_epi64(X, N) \
   (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
-#else
-#define _mm256_extract_epi64(X, N) \
-  (__extension__ \
-   ({ \
-     __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
-     _mm_extract_epi64 (__Y, (N) % 2); \
-   }))
-#endif
 #endif
 
 /// Takes a [8 x i32] vector and replaces the vector element value
@@ -2152,11 +2138,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_pd(__m128i __a)
 {
-#ifdef __GNUC__
-  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __a);
-#else
   return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
-#endif
 }
 
 /// Converts a vector of [8 x i32] into a vector of [8 x float].
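
[Editor's note, not part of the patch: the conversion hunks above keep the
Clang-only __builtin_convertvector path, which converts a vector lane by
lane. A minimal scalar sketch of the semantics _mm256_cvtepi32_pd ends up
with; the helper name cvtepi32_pd_ref and the loop are illustrative
assumptions, not the code the compiler generates:

    #include <immintrin.h>

    /* Reference semantics: each 32-bit integer lane becomes a double. */
    static void cvtepi32_pd_ref(const int src[4], double dst[4])
    {
        for (int i = 0; i < 4; i++)
            dst[i] = (double)src[i];
    }

The intrinsic performs the same lane-wise conversion in a single vector
operation on a __m128i input.]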
@@ -2171,11 +2153,7 @@ _mm256_cvtepi32_pd(__m128i __a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a) { -#ifdef __GNUC__ - return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __a); -#else return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); -#endif } /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of @@ -2222,11 +2200,7 @@ _mm256_cvtps_epi32(__m256 __a) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a) { -#ifdef __GNUC__ - return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __a); -#else return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); -#endif } /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 @@ -2351,11 +2325,7 @@ _mm256_cvtss_f32(__m256 __a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a) { -#ifdef __GNUC__ - return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__a); -#else return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); -#endif } /// Moves and duplicates even-indexed values from a 256-bit vector of @@ -2380,11 +2350,7 @@ _mm256_movehdup_ps(__m256 __a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a) { -#ifdef __GNUC__ - return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__a); -#else return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); -#endif } /// Moves and duplicates double-precision floating point values from a @@ -2406,11 +2372,7 @@ _mm256_moveldup_ps(__m256 __a) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a) { -#ifdef __GNUC__ - return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__a, (__v4df)__a); -#else return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); -#endif } /* Unpack and Interleave */ @@ -2433,11 +2395,7 @@ _mm256_movedup_pd(__m256d __a) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b) { -#ifdef __GNUC__ - return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__a, (__v4df)__b); -#else return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); -#endif } /// Unpacks the even-indexed vector elements from two 256-bit vectors of @@ -2459,11 +2417,7 @@ _mm256_unpackhi_pd(__m256d __a, __m256d __b) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b) { -#ifdef __GNUC__ - return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__a, (__v4df)__b); -#else return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); -#endif } /// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the @@ -2490,11 +2444,7 @@ _mm256_unpacklo_pd(__m256d __a, __m256d __b) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b) { -#ifdef __GNUC__ - return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__a, (__v8sf)__b); -#else return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); -#endif } /// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the @@ -2521,11 +2471,7 @@ _mm256_unpackhi_ps(__m256 __a, __m256 __b) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b) { -#ifdef __GNUC__ - return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__a, (__v8sf)__b); -#else return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); -#endif } /* Bit Test */ @@ -3003,11 +2949,7 @@ _mm256_movemask_ps(__m256 __a) /// \headerfile /// /// This intrinsic corresponds to the VZEROALL instruction. 
-#ifdef __GNUC__ -static __inline void __DEFAULT_FN_ATTRS -#else static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) -#endif _mm256_zeroall(void) { __builtin_ia32_vzeroall(); @@ -3018,12 +2960,7 @@ _mm256_zeroall(void) /// \headerfile /// /// This intrinsic corresponds to the VZEROUPPER instruction. -// -#ifdef __GNUC__ -static __inline void __DEFAULT_FN_ATTRS -#else static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) -#endif _mm256_zeroupper(void) { __builtin_ia32_vzeroupper(); @@ -3102,13 +3039,9 @@ _mm256_broadcast_ss(float const *__a) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a) { -#ifdef __GNUC__ - return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__a); -#else __m128d __b = _mm_loadu_pd((const double *)__a); return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b, 0, 1, 0, 1); -#endif } /// Loads the data from a 128-bit vector of [4 x float] from the @@ -3126,13 +3059,9 @@ _mm256_broadcast_pd(__m128d const *__a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a) { -#ifdef __GNUC__ - return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__a); -#else __m128 __b = _mm_loadu_ps((const float *)__a); return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b, 0, 1, 2, 3, 0, 1, 2, 3); -#endif } /* SIMD load ops */ @@ -3592,12 +3521,8 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(__m256i *__a, __m256i __b) { -#ifdef __GNUC__ - __builtin_ia32_movntdq256 ((__v4di *)__a, (__v4di)__b); -#else typedef __v4di __v4di_aligned __attribute__((aligned(32))); __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); -#endif } /// Moves double-precision values from a 256-bit vector of [4 x double] @@ -3616,12 +3541,8 @@ _mm256_stream_si256(__m256i *__a, __m256i __b) static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(double *__a, __m256d __b) { -#ifdef __GNUC__ - __builtin_ia32_movntpd256 (__a, (__v4df)__b); -#else typedef __v4df __v4df_aligned __attribute__((aligned(32))); __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); -#endif } /// Moves single-precision floating point values from a 256-bit vector @@ -3641,12 +3562,8 @@ _mm256_stream_pd(double *__a, __m256d __b) static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(float *__p, __m256 __a) { -#ifdef __GNUC__ - __builtin_ia32_movntps256 (__p, (__v8sf)__a); -#else typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); -#endif } /* Create vectors */ @@ -3660,12 +3577,7 @@ _mm256_stream_ps(float *__p, __m256 __a) static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void) { -#ifdef __GNUC__ - __m256d __X = __X; - return __X; -#else return (__m256d)__builtin_ia32_undef256(); -#endif } /// Create a 256-bit vector of [8 x float] with undefined values. @@ -3678,12 +3590,7 @@ _mm256_undefined_pd(void) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void) { -#ifdef __GNUC__ - __m256 __X = __X; - return __X; -#else return (__m256)__builtin_ia32_undef256(); -#endif } /// Create a 256-bit integer vector with undefined values. 
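
[Editor's note: the stream-store hunks above retain the Clang-style
__builtin_nontemporal_store path. A minimal usage sketch, assuming a
32-byte-aligned destination (required by _mm256_stream_pd) and an element
count that is a multiple of 4; the function name fill_nt is hypothetical:

    #include <stddef.h>
    #include <immintrin.h>

    static void fill_nt(double *dst, double v, size_t n)
    {
        __m256d x = _mm256_set1_pd(v);
        for (size_t i = 0; i < n; i += 4)
            _mm256_stream_pd(dst + i, x);  /* non-temporal: bypasses cache */
        _mm_sfence();  /* order streaming stores before later loads */
    }
]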
@@ -3696,12 +3603,7 @@ _mm256_undefined_ps(void) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void) { -#ifdef __GNUC__ - __m256i __X = __X; - return __X; -#else return (__m256i)__builtin_ia32_undef256(); -#endif } /// Constructs a 256-bit floating-point vector of [4 x double] @@ -4508,11 +4410,7 @@ _mm256_castsi256_pd(__m256i __a) static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a) { -#ifdef __GNUC__ - return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__a); -#else return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); -#endif } /// Returns the lower 128 bits of a 256-bit floating-point vector of @@ -4529,11 +4427,7 @@ _mm256_castpd256_pd128(__m256d __a) static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__a); -#else return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); -#endif } /// Truncates a 256-bit integer vector into a 128-bit integer vector. @@ -4549,11 +4443,7 @@ _mm256_castps256_ps128(__m256 __a) static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_si_si256 ((__v8si)__a); -#else return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); -#endif } /// Constructs a 256-bit floating-point vector of [4 x double] from a @@ -4574,11 +4464,7 @@ _mm256_castsi256_si128(__m256i __a) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a) { -#ifdef __GNUC__ - return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__a); -#else return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); -#endif } /// Constructs a 256-bit floating-point vector of [8 x float] from a @@ -4599,11 +4485,7 @@ _mm256_castpd128_pd256(__m128d __a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a) { -#ifdef __GNUC__ - return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__a); -#else return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); -#endif } /// Constructs a 256-bit integer vector from a 128-bit integer vector. @@ -4622,11 +4504,62 @@ _mm256_castps128_ps256(__m128 __a) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a) { -#ifdef __GNUC__ - return (__m256i) __builtin_ia32_si256_si ((__v4si)__a); -#else return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); -#endif +} + +/// Constructs a 256-bit floating-point vector of [4 x double] from a +/// 128-bit floating-point vector of [2 x double]. The lower 128 bits +/// contain the value of the source vector. The upper 128 bits are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_zextpd128_pd256(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); +} + +/// Constructs a 256-bit floating-point vector of [8 x float] from a +/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. 
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_zextps128_ps256(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); +} + +/// Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 256-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The upper 128 bits are set to zero. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_zextsi128_si256(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); } /* @@ -5013,11 +4946,7 @@ _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128 (__m128 __hi, __m128 __lo) { -#ifdef __GNUC__ - return _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo), __hi, 1); -#else return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); -#endif } /// Constructs a 256-bit floating-point vector of [4 x double] by @@ -5038,11 +4967,7 @@ _mm256_set_m128 (__m128 __hi, __m128 __lo) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d (__m128d __hi, __m128d __lo) { -#ifdef __GNUC__ - return (__m256d) _mm256_insertf128_pd (_mm256_castpd128_pd256 (__lo), __hi, 1); -#else return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3); -#endif } /// Constructs a 256-bit integer vector by concatenating two 128-bit @@ -5062,11 +4987,7 @@ _mm256_set_m128d (__m128d __hi, __m128d __lo) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i (__m128i __hi, __m128i __lo) { -#ifdef __GNUC__ - return (__m256i) _mm256_insertf128_si256 (_mm256_castsi128_si256 (__lo), __hi, 1); -#else return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3); -#endif } /// Constructs a 256-bit floating-point vector of [8 x float] by diff --git a/include/emmintrin.h b/include/emmintrin.h index c0573a1..f0ea7cd 100644 --- a/include/emmintrin.h +++ b/include/emmintrin.h @@ -45,15 +45,8 @@ typedef unsigned char __v16qu __attribute__((__vector_size__(16))); typedef signed char __v16qs __attribute__((__vector_size__(16))); /* Define the default attributes for the functions in this file. */ -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) -#endif - -#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) /// Adds lower double-precision values in both operands and returns the /// sum in the lower 64 bits of the result. 
The upper 64 bits of the result @@ -1332,12 +1325,8 @@ _mm_cvtpd_ps(__m128d __a) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { -#ifdef __GNUC__ - return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __a); -#else return (__m128d) __builtin_convertvector( __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); -#endif } /// Converts the lower two integer elements of a 128-bit vector of @@ -1359,12 +1348,8 @@ _mm_cvtps_pd(__m128 __a) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) { -#ifdef __GNUC__ - return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __a); -#else return (__m128d) __builtin_convertvector( __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); -#endif } /// Converts the two double-precision floating-point elements of a @@ -1383,11 +1368,7 @@ _mm_cvtepi32_pd(__m128i __a) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __a); -#else return __builtin_ia32_cvtpd2dq((__v2df)__a); -#endif } /// Converts the low-order element of a 128-bit vector of [2 x double] @@ -1652,13 +1633,8 @@ _mm_load1_pd(double const *__dp) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) { -#ifdef __GNUC__ - __m128d __tmp = _mm_load_pd (__dp); - return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1)); -#else __m128d __u = *(__m128d*)__dp; return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); -#endif } /// Loads a 128-bit floating-point vector of [2 x double] from an @@ -1791,12 +1767,7 @@ _mm_loadl_pd(__m128d __a, double const *__dp) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) { -#ifdef __GNUC__ - __m128d __X = __X; - return __X; -#else return (__m128d)__builtin_ia32_undef128(); -#endif } /// Constructs a 128-bit floating-point vector of [2 x double]. The lower @@ -1989,12 +1960,8 @@ _mm_store_pd(double *__dp, __m128d __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a) { -#ifdef __GNUC__ - _mm_store_pd (__dp, __builtin_ia32_shufpd (__a, __a, _MM_SHUFFLE2 (0,0))); -#else __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); _mm_store_pd(__dp, __a); -#endif } /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to @@ -2055,12 +2022,8 @@ _mm_storeu_pd(double *__dp, __m128d __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a) { -#ifdef __GNUC__ - _mm_store_pd (__dp, __builtin_ia32_shufpd (__a, __a, _MM_SHUFFLE2 (0,1))); -#else __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); *(__m128d *)__dp = __a; -#endif } /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a @@ -2181,13 +2144,11 @@ _mm_add_epi32(__m128i __a, __m128i __b) /// \param __b /// A 64-bit integer. /// \returns A 64-bit integer containing the sum of both parameters. 
-#ifndef __GNUC__ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); } -#endif /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], /// saving the lower 64 bits of each sum in the corresponding element of a @@ -2313,15 +2274,11 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__a, (__v16qi)__b); -#else typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); return (__m128i)__builtin_convertvector( ((__builtin_convertvector((__v16qu)__a, __v16hu) + __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) >> 1, __v16qu); -#endif } /// Computes the rounded avarages of corresponding elements of two @@ -2341,15 +2298,11 @@ _mm_avg_epu8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__a, (__v8hi)__b); -#else typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); return (__m128i)__builtin_convertvector( ((__builtin_convertvector((__v8hu)__a, __v8su) + __builtin_convertvector((__v8hu)__b, __v8su)) + 1) >> 1, __v8hu); -#endif } /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] @@ -2534,11 +2487,7 @@ _mm_mullo_epi16(__m128i __a, __m128i __b) static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b) { -#ifdef __GNUC__ - return (__m64)__builtin_ia32_pmuludq ((__v2si)__a, (__v2si)__b); -#else return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); -#endif } /// Multiplies 32-bit unsigned integer values contained in the lower @@ -2649,14 +2598,11 @@ _mm_sub_epi32(__m128i __a, __m128i __b) /// A 64-bit integer vector containing the subtrahend. /// \returns A 64-bit integer vector containing the difference of the values in /// the operands. - -#ifndef __GNUC__ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); } -#endif /// Subtracts the corresponding elements of two [2 x i64] vectors. /// @@ -3065,18 +3011,11 @@ _mm_sra_epi32(__m128i __a, __m128i __count) /// An immediate value specifying the number of bytes to right-shift operand /// \a a. /// \returns A 128-bit integer vector containing the right-shifted value. -#ifdef __GNUC__ -#define _mm_bsrli_si128(a, n) \ - ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(a), (int)(n) * 8)) -#define _mm_srli_si128(a, n) \ - ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(a), (int)(n) * 8)) -#else #define _mm_srli_si128(a, imm) \ (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) + #define _mm_bsrli_si128(a, imm) \ (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) -#endif - /// Right-shifts each of 16-bit values in the 128-bit integer vector /// operand by the specified number of bits. High-order bits are cleared. @@ -3449,11 +3388,7 @@ _mm_cvttsd_si64(__m128d __a) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) { -#ifdef __GNUC__ - return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __a); -#else return (__m128)__builtin_convertvector((__v4si)__a, __v4sf); -#endif } /// Converts a vector of [4 x float] into a vector of [4 x i32]. 
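
[Editor's note: the retained _mm_avg_epu8/_mm_avg_epu16 bodies above widen
each lane (8-bit to 16-bit, 16-bit to 32-bit) before adding, so the rounded
average (a + b + 1) >> 1 cannot wrap in the narrow type. A scalar reference
for one 8-bit lane; the helper name avg_epu8_ref is illustrative:

    /* Rounded average of two unsigned bytes; the operands are promoted
     * to int by the arithmetic, so a + b + 1 cannot overflow. */
    static unsigned char avg_epu8_ref(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }
]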
@@ -3628,12 +3563,7 @@ _mm_loadl_epi64(__m128i const *__p) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) { -#ifdef __GNUC__ - __m128i __X = __X; - return __X; -#else return (__m128i)__builtin_ia32_undef128(); -#endif } /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with @@ -4129,11 +4059,7 @@ _mm_storel_epi64(__m128i *__p, __m128i __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a) { -#ifdef __GNUC__ - __builtin_ia32_movntpd (__p, (__v2df)__a); -#else __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); -#endif } /// Stores a 128-bit integer vector to a 128-bit aligned memory location. @@ -4152,11 +4078,7 @@ _mm_stream_pd(double *__p, __m128d __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a) { -#ifdef __GNUC__ - __builtin_ia32_movntdq ((__v2di *)__p, (__v2di)__a); -#else __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); -#endif } /// Stores a 32-bit integer value in the specified memory location. @@ -4172,12 +4094,7 @@ _mm_stream_si128(__m128i *__p, __m128i __a) /// A pointer to the 32-bit memory location used to store the value. /// \param __a /// A 32-bit integer containing the value to be stored. -static __inline__ void -#ifdef __GNUC__ -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else -__attribute__((__always_inline__, __nodebug__, __target__("sse2"))) -#endif +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) _mm_stream_si32(int *__p, int __a) { __builtin_ia32_movnti(__p, __a); @@ -4197,12 +4114,7 @@ _mm_stream_si32(int *__p, int __a) /// A pointer to the 64-bit memory location used to store the value. /// \param __a /// A 64-bit integer containing the value to be stored. 
-static __inline__ void -#ifdef __GNUC__ -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else -__attribute__((__always_inline__, __nodebug__, __target__("sse2"))) -#endif +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) _mm_stream_si64(long long *__p, long long __a) { __builtin_ia32_movnti64(__p, __a); @@ -4526,11 +4438,7 @@ _mm_movemask_epi8(__m128i __a) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__a, (__v16qi)__b); -#else return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); -#endif } /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of @@ -4557,11 +4465,7 @@ _mm_unpackhi_epi8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__a, (__v8hi)__b); -#else return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); -#endif } /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of @@ -4584,11 +4488,7 @@ _mm_unpackhi_epi16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__a, (__v4si)__b); -#else return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); -#endif } /// Unpacks the high-order 64-bit elements from two 128-bit vectors of @@ -4609,11 +4509,7 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__a, (__v2di)__b); -#else return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); -#endif } /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of @@ -4648,11 +4544,7 @@ _mm_unpackhi_epi64(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__a, (__v16qi)__b); -#else return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); -#endif } /// Unpacks the low-order (index 0-3) values from each of the two 128-bit @@ -4680,11 +4572,7 @@ _mm_unpacklo_epi8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__a, (__v8hi)__b); -#else return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); -#endif } /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of @@ -4707,11 +4595,7 @@ _mm_unpacklo_epi16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__a, (__v4si)__b); -#else return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); -#endif } /// Unpacks the low-order 64-bit elements from two 128-bit vectors of @@ -4732,11 +4616,7 @@ _mm_unpacklo_epi32(__m128i __a, __m128i __b) static 
__inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__a, (__v2di)__b); -#else return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); -#endif } /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit @@ -4788,11 +4668,7 @@ _mm_movpi64_epi64(__m64 __a) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { -#ifdef __GNUC__ - return (__m128i)__builtin_ia32_movq128 ((__v2di) __a); -#else return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); -#endif } /// Unpacks the high-order 64-bit elements from two 128-bit vectors of @@ -4813,11 +4689,7 @@ _mm_move_epi64(__m128i __a) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b) { -#ifdef __GNUC__ - return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__a, (__v2df)__b); -#else return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); -#endif } /// Unpacks the low-order 64-bit elements from two 128-bit vectors @@ -4838,11 +4710,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b) { -#ifdef __GNUC__ - return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__a, (__v2df)__b); -#else return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); -#endif } /// Extracts the sign bits of the double-precision values in the 128-bit @@ -4999,26 +4867,29 @@ _mm_castsi128_pd(__m128i __a) extern "C" { #endif +/// Indicates that a spin loop is being executed for the purposes of +/// optimizing power consumption during the loop. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PAUSE instruction. +/// +void _mm_pause(void); + #if defined(__cplusplus) } // extern "C" #endif #undef __DEFAULT_FN_ATTRS #undef __DEFAULT_FN_ATTRS_MMX -#ifndef _MM_DENORMALS_ZERO_ON +#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) + #define _MM_DENORMALS_ZERO_ON (0x0040) -#endif -#ifndef _MM_DENORMALS_ZERO_OFF #define _MM_DENORMALS_ZERO_OFF (0x0000) -#endif -#ifndef _MM_DENORMALS_ZERO_MASK #define _MM_DENORMALS_ZERO_MASK (0x0040) -#endif -#ifndef _MM_GET_DENORMALS_ZERO_MODE #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) -#endif #endif /* __EMMINTRIN_H */ diff --git a/include/mmintrin.h b/include/mmintrin.h index a5c2829..a735399 100644 --- a/include/mmintrin.h +++ b/include/mmintrin.h @@ -24,24 +24,15 @@ #ifndef __MMINTRIN_H #define __MMINTRIN_H -#ifdef __GNUC__ -typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__)); -#else typedef long long __m64 __attribute__((__vector_size__(8))); -#endif -typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1))); typedef long long __v1di __attribute__((__vector_size__(8))); typedef int __v2si __attribute__((__vector_size__(8))); typedef short __v4hi __attribute__((__vector_size__(8))); typedef char __v8qi __attribute__((__vector_size__(8))); -typedef float __v2sf __attribute__ ((__vector_size__ (8))); + /* Define the default attributes for the functions in this file. 
*/ -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64))) -#endif /// Clears the MMX state by setting the state of the x87 stack registers /// to empty. @@ -50,12 +41,7 @@ typedef float __v2sf __attribute__ ((__vector_size__ (8))); /// /// This intrinsic corresponds to the EMMS instruction. /// -static __inline__ void -#ifdef __GNUC__ -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else -__attribute__((__always_inline__, __nodebug__, __target__("mmx"))) -#endif +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) _mm_empty(void) { __builtin_ia32_emms(); @@ -757,11 +743,7 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi16(__m64 __m, __m64 __count) { -#ifdef __GNUC__ - return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count); -#else return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); -#endif } /// Left-shifts each 16-bit signed integer element of a 64-bit integer @@ -806,7 +788,7 @@ _mm_slli_pi16(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_pslld((__v2si)__m, (__v2si)__count); + return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); } /// Left-shifts each 32-bit signed integer element of a 64-bit integer @@ -848,7 +830,7 @@ _mm_slli_pi32(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllq((__v1di)__m, (__v1di)__count); + return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); } /// Left-shifts the first parameter, which is a 64-bit integer, by the @@ -892,7 +874,7 @@ _mm_slli_si64(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psraw((__v4hi)__m, (__v4hi)__count); + return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -939,7 +921,7 @@ _mm_srai_pi16(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrad((__v2si)__m, (__v2si)__count); + return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -985,7 +967,7 @@ _mm_srai_pi32(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlw((__v4hi)__m, (__v4hi)__count); + return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -1030,7 +1012,7 @@ _mm_srli_pi16(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrld((__v2si)__m, (__v2si)__count); + return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -1072,7 +1054,7 @@ _mm_srli_pi32(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlq((__v1di)__m, (__v1di)__count); + return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); } /// Right-shifts the first 
parameter, which is a 64-bit integer, by the @@ -1111,7 +1093,7 @@ _mm_srli_si64(__m64 __m, int __count) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_and_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pand(__m1, __m2); + return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); } /// Performs a bitwise NOT of the first 64-bit integer vector, and then @@ -1132,11 +1114,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_andnot_si64(__m64 __m1, __m64 __m2) { -#ifdef __GNUC__ - return __builtin_ia32_pandn (__m1, __m2); -#else return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); -#endif } /// Performs a bitwise OR of two 64-bit integer vectors. @@ -1154,11 +1132,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_or_si64(__m64 __m1, __m64 __m2) { -#ifdef __GNUC__ - return __builtin_ia32_por(__m1, __m2); -#else return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); -#endif } /// Performs a bitwise exclusive OR of two 64-bit integer vectors. @@ -1176,7 +1150,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_xor_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pxor (__m1, __m2); + return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of diff --git a/include/pmmintrin.h b/include/pmmintrin.h index 24b7d68..7e1a9ea 100644 --- a/include/pmmintrin.h +++ b/include/pmmintrin.h @@ -27,13 +27,8 @@ #include /* Define the default attributes for the functions in this file. */ -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128))) -#endif /// Loads data from an unaligned memory location to elements in a 128-bit /// vector. @@ -139,11 +134,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehdup_ps(__m128 __a) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movshdup ((__v4sf)__a); -#else return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); -#endif } /// Duplicates even-indexed values from a 128-bit vector of @@ -164,11 +155,7 @@ _mm_movehdup_ps(__m128 __a) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_moveldup_ps(__m128 __a) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movsldup ((__v4sf)__a); -#else return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); -#endif } /// Adds the even-indexed values and subtracts the odd-indexed values of @@ -269,11 +256,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_movedup_pd(__m128d __a) { -#ifdef __GNUC__ - return _mm_shuffle_pd (__a, __a, _MM_SHUFFLE2 (0,0)); -#else return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); -#endif } /// Establishes a linear address memory range to be monitored and puts diff --git a/include/popcntintrin.h b/include/popcntintrin.h index bba0573..75ceab9 100644 --- a/include/popcntintrin.h +++ b/include/popcntintrin.h @@ -25,11 +25,7 @@ #define __POPCNTINTRIN_H /* Define the default attributes for the functions in this file. */ -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) -#endif /// Counts the number of bits in the source operand having a value of 1. 
/// diff --git a/include/smmintrin.h b/include/smmintrin.h index 4f1d637..4806b3e 100644 --- a/include/smmintrin.h +++ b/include/smmintrin.h @@ -27,11 +27,7 @@ #include /* Define the default attributes for the functions in this file. */ -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128))) -#endif /* SSE4 Rounding macros. */ #define _MM_FROUND_TO_NEAREST_INT 0x00 @@ -662,11 +658,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_stream_load_si128 (__m128i const *__V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V); -#else return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); -#endif } /* SSE4 Packed Integer Min/Max Instructions. */ @@ -1251,13 +1243,9 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__V); -#else /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); -#endif } /// Sign-extends each of the lower four 8-bit integer elements of a @@ -1278,11 +1266,7 @@ _mm_cvtepi8_epi32(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); -#endif } /// Sign-extends each of the lower two 8-bit integer elements of a @@ -1301,13 +1285,9 @@ _mm_cvtepi8_epi32(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__V); -#else /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. 
*/ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); -#endif } /// Sign-extends each of the lower four 16-bit integer elements of a @@ -1326,11 +1306,7 @@ _mm_cvtepi8_epi64(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); -#endif } /// Sign-extends each of the lower two 16-bit integer elements of a @@ -1349,11 +1325,7 @@ _mm_cvtepi16_epi32(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); -#endif } /// Sign-extends each of the lower two 32-bit integer elements of a @@ -1372,11 +1344,7 @@ _mm_cvtepi16_epi64(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); -#endif } /* SSE4 Packed Integer Zero-Extension. */ @@ -1396,11 +1364,7 @@ _mm_cvtepi32_epi64(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); -#endif } /// Zero-extends each of the lower four 8-bit integer elements of a @@ -1419,11 +1383,7 @@ _mm_cvtepu8_epi16(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); -#endif } /// Zero-extends each of the lower two 8-bit integer elements of a @@ -1442,11 +1402,7 @@ _mm_cvtepu8_epi32(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); -#endif } /// Zero-extends each of the lower four 16-bit integer elements of a @@ -1465,11 +1421,7 @@ _mm_cvtepu8_epi64(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); -#endif } /// Zero-extends each of the lower two 16-bit integer elements of a @@ -1488,11 +1440,7 @@ _mm_cvtepu16_epi32(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); -#endif } /// Zero-extends each of the lower two 32-bit integer elements of a @@ -1511,11 +1459,7 @@ _mm_cvtepu16_epi64(__m128i __V) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i 
__V) { -#ifdef __GNUC__ - return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__V); -#else return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); -#endif } /* SSE4 Pack with Unsigned Saturation. */ @@ -1612,11 +1556,7 @@ _mm_minpos_epu16(__m128i __V) so we'll do the same. */ #undef __DEFAULT_FN_ATTRS -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) -#endif /* These specify the type of data that we're comparing. */ #define _SIDD_UBYTE_OPS 0x00 diff --git a/include/tmmintrin.h b/include/tmmintrin.h index 7c9ece6..734cd39 100644 --- a/include/tmmintrin.h +++ b/include/tmmintrin.h @@ -27,13 +27,8 @@ #include /* Define the default attributes for the functions in this file. */ -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64))) #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64))) -#endif /// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer @@ -163,15 +158,9 @@ _mm_abs_epi32(__m128i __a) /// An immediate operand specifying how many bytes to right-shift the result. /// \returns A 128-bit integer vector containing the concatenated right-shifted /// value. -#ifdef __GNUC__ -#define _mm_alignr_epi8(a, b, n) \ - (__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (n)) -#else #define _mm_alignr_epi8(a, b, n) \ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(b), (n)) -#endif /// Concatenates the two 64-bit integer vector operands, and right-shifts /// the result by the number of bytes specified in the immediate operand. diff --git a/include/xmmintrin.h b/include/xmmintrin.h index a630e48..17af172 100644 --- a/include/xmmintrin.h +++ b/include/xmmintrin.h @@ -25,15 +25,6 @@ #define __XMMINTRIN_H #include -/* Define the default attributes for the functions in this file. */ -#ifdef __GNUC__ -#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) -#endif - typedef int __v4si __attribute__((__vector_size__(16))); typedef float __v4sf __attribute__((__vector_size__(16))); @@ -48,8 +39,9 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16))); #include #endif -#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) - +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) /// Adds the 32-bit float values in the low-order bits of the operands. /// @@ -630,14 +622,9 @@ _mm_cmple_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf) - __builtin_ia32_cmpltss ((__v4sf) __b, (__v4sf) __a)); -#else return (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), 4, 1, 2, 3); -#endif } /// Compares each of the corresponding 32-bit float values of the @@ -680,14 +667,9 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf) - __builtin_ia32_cmpless ((__v4sf) __b, (__v4sf) __a)); -#else return (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), 4, 1, 2, 3); -#endif } /// Compares each of the corresponding 32-bit float values of the @@ -862,15 +844,9 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b) { - -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf) - __builtin_ia32_cmpnltss ((__v4sf) __b, (__v4sf) __a)); -#else return (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), 4, 1, 2, 3); -#endif } /// Compares each of the corresponding 32-bit float values of the @@ -915,14 +891,9 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf) - __builtin_ia32_cmpnless ((__v4sf) __b, (__v4sf) __a)); -#else return (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), 4, 1, 2, 3); -#endif } /// Compares each of the corresponding 32-bit float values of the @@ -1664,9 +1635,6 @@ _mm_cvtss_f32(__m128 __a) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_loadhps ((__v4sf)__a, (const __v2sf *)__p); -#else typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); struct __mm_loadh_pi_struct { __mm_loadh_pi_v2f32 __u; @@ -1674,7 +1642,6 @@ _mm_loadh_pi(__m128 __a, const __m64 *__p) __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); -#endif } /// Loads two packed float values from the address \a __p into the @@ -1695,9 +1662,6 @@ _mm_loadh_pi(__m128 __a, const __m64 *__p) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_loadlps ((__v4sf)__a, (const __v2sf *)__p); -#else typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); struct __mm_loadl_pi_struct { __mm_loadl_pi_v2f32 __u; @@ -1705,7 +1669,6 @@ _mm_loadl_pi(__m128 __a, const __m64 *__p) __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); return 
__builtin_shufflevector(__a, __bb, 4, 5, 2, 3); -#endif } /// Constructs a 128-bit floating-point vector of [4 x float]. The lower @@ -1810,12 +1773,8 @@ _mm_loadu_ps(const float *__p) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_shufps (*(__v4sf *)__p, *(__v4sf *)__p, _MM_SHUFFLE (0,1,2,3)); -#else __m128 __a = _mm_load_ps(__p); return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); -#endif } /// Create a 128-bit vector of [4 x float] with undefined values. @@ -1828,12 +1787,7 @@ _mm_loadr_ps(const float *__p) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void) { -#ifdef __GNUC__ - __m128 __X = __X; - return __X; -#else return (__m128)__builtin_ia32_undef128(); -#endif } /// Constructs a 128-bit floating-point vector of [4 x float]. The lower @@ -1977,11 +1931,7 @@ _mm_setzero_ps(void) static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a) { -#ifdef __GNUC__ - __builtin_ia32_storehps((__v2sf *)__p, (__v4sf)__a); -#else __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); -#endif } /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a @@ -1998,11 +1948,7 @@ _mm_storeh_pi(__m64 *__p, __m128 __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a) { -#ifdef __GNUC__ - __builtin_ia32_storelps ((__v2sf *)__p, (__v4sf)__a); -#else __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); -#endif } /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a @@ -2080,11 +2026,7 @@ _mm_store_ps(float *__p, __m128 __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a) { -#ifdef __GNUC__ - __a = (__m128)__builtin_ia32_shufps((__v4sf)__a, (__v4sf)__a, _MM_SHUFFLE (0,0,0,0)); -#else - __a = (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); -#endif + __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); _mm_store_ps(__p, __a); } @@ -2123,11 +2065,7 @@ _mm_store_ps1(float *__p, __m128 __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a) { -#ifdef __GNUC__ - __a = __builtin_ia32_shufps ((__v4sf)__a, (__v4sf)__a, _MM_SHUFFLE (0,1,2,3)); -#else __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); -#endif _mm_store_ps(__p, __a); } @@ -2185,11 +2123,7 @@ _mm_storer_ps(float *__p, __m128 __a) static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_stream_pi(__m64 *__p, __m64 __a) { -#ifdef __GNUC__ - __builtin_ia32_movntq ((unsigned long long *)__p, (unsigned long long)__a); -#else __builtin_ia32_movntq(__p, __a); -#endif } /// Moves packed float values from a 128-bit vector of [4 x float] to a @@ -2208,11 +2142,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(float *__p, __m128 __a) { -#ifdef __GNUC__ - __builtin_ia32_movntps (__p, (__v4sf)__a); -#else __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); -#endif } #if defined(__cplusplus) @@ -2697,11 +2627,7 @@ void _mm_setcsr(unsigned int __i); static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_unpckhps ((__v4sf)__a, (__v4sf)__b); -#else return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); -#endif } /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of @@ -2723,11 +2649,7 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b) { 
-#ifdef __GNUC__ - return (__m128) __builtin_ia32_unpcklps ((__v4sf)__a, (__v4sf)__b); -#else return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); -#endif } /// Constructs a 128-bit floating-point vector of [4 x float]. The lower @@ -2771,11 +2693,7 @@ _mm_move_ss(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movhlps ((__v4sf)__a, (__v4sf)__b); -#else return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); -#endif } /// Constructs a 128-bit floating-point vector of [4 x float]. The lower @@ -2796,11 +2714,7 @@ _mm_movehl_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b) { -#ifdef __GNUC__ - return (__m128) __builtin_ia32_movlhps ((__v4sf)__a, (__v4sf)__b); -#else return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); -#endif } /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x @@ -3019,6 +2933,7 @@ _mm_movemask_ps(__m128 __a) #define _MM_ALIGN16 __attribute__((aligned(16))) +#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) #define _MM_EXCEPT_INVALID (0x0001) #define _MM_EXCEPT_DENORM (0x0002) @@ -3086,42 +3001,12 @@ do { \ #define _m_ _mm_ #define _m_ _mm_ -/// Indicates that a spin loop is being executed for the purposes of -/// optimizing power consumption during the loop. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PAUSE instruction. -/// -static inline void __DEFAULT_FN_ATTRS _mm_pause(void) -{ - __builtin_ia32_pause(); -} - #undef __DEFAULT_FN_ATTRS #undef __DEFAULT_FN_ATTRS_MMX -/* Set the control register to I. */ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_setcsr (unsigned int __I) -{ - __builtin_ia32_ldmxcsr (__I); -} - -/* Return the contents of the control register. */ -extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_getcsr (void) -{ - return __builtin_ia32_stmxcsr (); -} - /* Ugly hack for backwards-compatibility (compatible with gcc) */ -#ifdef __GNUC__ -#include -#else #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) #include #endif -#endif #endif /* __XMMINTRIN_H */
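
[Editor's note: with the GCC branches gone, xmmintrin.h defines _MM_SHUFFLE
unconditionally; it packs four 2-bit lane selectors into the immediate that
_mm_shuffle_ps and related intrinsics expect. A small usage sketch; the
function name reverse_ps is hypothetical:

    #include <xmmintrin.h>

    /* Reverse the four lanes: result = {v[3], v[2], v[1], v[0]}. */
    static __m128 reverse_ps(__m128 v)
    {
        return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
    }

This mirrors the __builtin_shufflevector(..., 3, 2, 1, 0) form used by the
reverted _mm_loadr_ps and _mm_storer_ps bodies above.]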