/* 256-bit integer vector type (AVX). */
typedef long long __m256i __attribute__((__vector_size__(32)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
/* Arithmetic */
/// Adds two 256-bit vectors of [4 x double].
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
#endif /* closes a conditional (64-bit only) opened before this chunk -- TODO confirm */
/// Converts a 128-bit vector of [4 x i32] into a 256-bit vector of
/// [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtepi32_pd(__m128i __a)
{
  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
}

/// Converts a vector of [8 x i32] into a vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_cvtepi32_ps(__m256i __a)
{
  return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
}

/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of
/// [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtps_pd(__m128 __a)
{
  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
}
/// Moves and duplicates odd-indexed values from a 256-bit vector of
/// [8 x float] to float values in a 256-bit vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_movehdup_ps(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
}

/// Moves and duplicates even-indexed values from a 256-bit vector of
/// [8 x float] to float values in a 256-bit vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_moveldup_ps(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
}

/// Moves and duplicates double-precision floating point values from a
/// 256-bit vector of [4 x double]: even-indexed elements are duplicated
/// into the adjacent odd positions.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_movedup_pd(__m256d __a)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
}
/* Unpack and Interleave */
/// Unpacks the odd-indexed elements from two 256-bit vectors of
/// [4 x double], interleaving them within each 128-bit lane.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpackhi_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
}

/// Unpacks the even-indexed vector elements from two 256-bit vectors of
/// [4 x double], interleaving them within each 128-bit lane.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpacklo_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
}

/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
/// two 256-bit vectors of [8 x float] and interleaves them.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpackhi_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
/// two 256-bit vectors of [8 x float] and interleaves them.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpacklo_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}
/* Bit Test */
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
/// Loads a 128-bit vector of [2 x double] from unaligned memory and
/// broadcasts it to both 128-bit halves of the result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_pd(__m128d const *__a)
{
  __m128d __b = _mm_loadu_pd((const double *)__a);
  return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
                                          0, 1, 0, 1);
}

/// Loads the data from a 128-bit vector of [4 x float] from the
/// specified unaligned memory location and broadcasts it to both
/// 128-bit halves of the result.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ps(__m128 const *__a)
{
  __m128 __b = _mm_loadu_ps((const float *)__a);
  return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
                                         0, 1, 2, 3, 0, 1, 2, 3);
}
/* SIMD load ops */
/// Stores a 256-bit integer vector using a non-temporal memory hint.
/// \a __a must be 32-byte aligned.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
}

/// Moves double-precision values from a 256-bit vector of [4 x double]
/// to a 32-byte aligned memory location, using a non-temporal hint.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double *__a, __m256d __b)
{
  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
}

/// Moves single-precision floating point values from a 256-bit vector
/// of [8 x float] to a 32-byte aligned memory location, using a
/// non-temporal hint.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float *__p, __m256 __a)
{
  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
}
/* Create vectors */
/// Create a 256-bit vector of [4 x double] with undefined values.
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_undefined_pd(void)
{
  return (__m256d)__builtin_ia32_undef256();
}

/// Create a 256-bit vector of [8 x float] with undefined values.
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_undefined_ps(void)
{
  return (__m256)__builtin_ia32_undef256();
}

/// Create a 256-bit integer vector with undefined values.
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_undefined_si256(void)
{
  return (__m256i)__builtin_ia32_undef256();
}
/// Returns the lower 128 bits of a 256-bit floating-point vector of
/// [4 x double] as a 128-bit vector of [2 x double].
static __inline __m128d __DEFAULT_FN_ATTRS
_mm256_castpd256_pd128(__m256d __a)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
}

/// Returns the lower 128 bits of a 256-bit floating-point vector of
/// [8 x float] as a 128-bit vector of [4 x float].
static __inline __m128 __DEFAULT_FN_ATTRS
_mm256_castps256_ps128(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
}

/// Truncates a 256-bit integer vector into a 128-bit integer vector.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_castsi256_si128(__m256i __a)
{
  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
}
/// Constructs a 256-bit floating-point vector of [4 x double] from a
/// 128-bit vector of [2 x double]; the upper 128 bits are undefined.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castpd128_pd256(__m128d __a)
{
  /* -1 shuffle indices select "don't care" lanes. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
}

/// Constructs a 256-bit floating-point vector of [8 x float] from a
/// 128-bit vector of [4 x float]; the upper 128 bits are undefined.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castps128_ps256(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
}

/// Constructs a 256-bit integer vector from a 128-bit integer vector;
/// the upper 128 bits are undefined.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castsi128_si256(__m128i __a)
{
  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
}
+
+/// Constructs a 256-bit floating-point vector of [4 x double] from a
+/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+/// contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_zextpd128_pd256(__m128d __a)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
+}
+
+/// Constructs a 256-bit floating-point vector of [8 x float] from a
+/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+/// the value of the source vector. The upper 128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+/// contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_zextps128_ps256(__m128 __a)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The upper
+/// 128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+/// the parameter. The upper 128 bits are set to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_zextsi128_si256(__m128i __a)
+{
+ return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
}
/*
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set_m128 (__m128 __hi, __m128 __lo)
{
-#ifdef __GNUC__
- return _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo), __hi, 1);
-#else
return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
-#endif
}
/// Constructs a 256-bit floating-point vector of [4 x double] by
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set_m128d (__m128d __hi, __m128d __lo)
{
-#ifdef __GNUC__
- return (__m256d) _mm256_insertf128_pd (_mm256_castpd128_pd256 (__lo), __hi, 1);
-#else
return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
-#endif
}
/// Constructs a 256-bit integer vector by concatenating two 128-bit
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_m128i (__m128i __hi, __m128i __lo)
{
-#ifdef __GNUC__
- return (__m256i) _mm256_insertf128_si256 (_mm256_castsi128_si256 (__lo), __hi, 1);
-#else
return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
-#endif
}
/// Constructs a 256-bit floating-point vector of [8 x float] by
/* Explicitly-signed byte vector: plain char signedness is implementation-defined. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
/// Converts the lower two single-precision elements of a 128-bit vector
/// of [4 x float] into two double-precision values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a)
{
  return (__m128d) __builtin_convertvector(
      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
}

/// Converts the lower two integer elements of a 128-bit vector of
/// [4 x i32] into two double-precision values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a)
{
  return (__m128d) __builtin_convertvector(
      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
}
/// Converts the two double-precision floating-point elements of a
/// 128-bit vector of [2 x double] into two signed 32-bit integers,
/// placed in the lower 64 bits of the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a)
{
  return __builtin_ia32_cvtpd2dq((__v2df)__a);
}
/// Loads two double-precision values from 16-byte aligned memory in
/// reversed order.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp)
{
  /* Aligned load, then swap the two elements. */
  __m128d __u = *(__m128d*)__dp;
  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
}
/// Generates a 128-bit vector of [2 x double] with unspecified content.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void)
{
  return (__m128d)__builtin_ia32_undef128();
}
/// Stores the lower double-precision value of \a __a into both elements
/// of a 16-byte aligned memory location.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp, __m128d __a)
{
  /* Duplicate the low element into both lanes, then do an aligned store. */
  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
  _mm_store_pd(__dp, __a);
}

/// Stores the two double-precision values of \a __a in reversed order
/// to a 16-byte aligned memory location.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_pd(double *__dp, __m128d __a)
{
  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
  *(__m128d *)__dp = __a;
}
/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
/// \param __b
/// A 64-bit integer.
/// \returns A 64-bit integer containing the sum of both parameters.
-#ifndef __GNUC__
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_add_si64(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
}
-#endif
/// Computes the rounded averages of corresponding elements of two
/// 128-bit unsigned [16 x i8] vectors.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{
  /* Widen to 16 bits so (a + b + 1) cannot overflow, then shift back down. */
  typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
  return (__m128i)__builtin_convertvector(
      ((__builtin_convertvector((__v16qu)__a, __v16hu) +
        __builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
      >> 1, __v16qu);
}

/// Computes the rounded averages of corresponding elements of two
/// 128-bit unsigned [8 x i16] vectors.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{
  /* Widen to 32 bits so (a + b + 1) cannot overflow, then shift back down. */
  typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
  return (__m128i)__builtin_convertvector(
      ((__builtin_convertvector((__v8hu)__a, __v8su) +
        __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
      >> 1, __v8hu);
}
/// Multiplies the lower 32-bit unsigned values of \a __a and \a __b,
/// returning the 64-bit product.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mul_su32(__m64 __a, __m64 __b)
{
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
}
/// Multiplies 32-bit unsigned integer values contained in the lower
/// A 64-bit integer vector containing the subtrahend.
/// \returns A 64-bit integer vector containing the difference of the values in
/// the operands.
-
-#ifndef __GNUC__
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sub_si64(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
}
-#endif
/// Right-shifts the 128-bit integer vector operand by the specified
/// number of bytes. High-order bits are cleared.
///
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))

#define _mm_bsrli_si128(a, imm) \
  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
/// Converts a vector of [4 x i32] into a vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a)
{
  return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
}

/// Generates a 128-bit integer vector with unspecified content.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_undefined_si128(void)
{
  return (__m128i)__builtin_ia32_undef128();
}
/// Stores a 128-bit vector of [2 x double] to a 16-byte aligned memory
/// location, using a non-temporal memory hint.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location,
/// using a non-temporal memory hint.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}
/// Stores a 32-bit integer value in the specified memory location.
/// \param __p
///    A pointer to the 32-bit memory location used to store the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}

/// Stores a 64-bit integer value in the specified memory location.
/// \param __p
///    A pointer to the 64-bit memory location used to store the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si64(long long *__p, long long __a)
{
  __builtin_ia32_movnti64(__p, __a);
}
/// Unpacks the high-order (index 8-15) bytes from two 128-bit vectors
/// of [16 x i8] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
/// [8 x i16] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
/// [4 x i32] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}

/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
/// [2 x i64] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
}

/// Unpacks the low-order (index 0-7) bytes from two 128-bit vectors of
/// [16 x i8] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
/// vectors of [8 x i16] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
/// [4 x i32] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
}

/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
/// [2 x i64] and interleaves them.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
}
/// Returns a vector whose lower 64 bits are those of \a __a and whose
/// upper 64 bits are zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{
  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
}
/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
/// [2 x double] and interleaves them.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}

/// Unpacks the low-order 64-bit elements from two 128-bit vectors
/// of [2 x double] and interleaves them.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
}
/// Extracts the sign bits of the double-precision values in the 128-bit
extern "C" {
#endif
+/// Indicates that a spin loop is being executed for the purposes of
+/// optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
+///
+void _mm_pause(void);
+
#if defined(__cplusplus)
} // extern "C"
#endif
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Denormals-are-zero (DAZ) control bit in the MXCSR register. */
#define _MM_DENORMALS_ZERO_ON (0x0040)
#define _MM_DENORMALS_ZERO_OFF (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif /* __EMMINTRIN_H */
#ifndef __MMINTRIN_H
#define __MMINTRIN_H
/* 64-bit MMX vector types. */
typedef long long __m64 __attribute__((__vector_size__(8)));
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
/// Clears the MMX state by setting the state of the x87 stack registers
/// to empty.
///
/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
_mm_empty(void)
{
  __builtin_ia32_emms();
}
/// Left-shifts each 16-bit element of \a __m by \a __count bits;
/// vacated bits are cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_pi16(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
}

/// Left-shifts each 32-bit element of \a __m by \a __count bits;
/// vacated bits are cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_pi32(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
}

/// Left-shifts the 64-bit integer \a __m by \a __count bits; vacated
/// bits are cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_si64(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
}

/// Arithmetically right-shifts each 16-bit element of \a __m by
/// \a __count bits, replicating the sign bit.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sra_pi16(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
}

/// Arithmetically right-shifts each 32-bit element of \a __m by
/// \a __count bits, replicating the sign bit.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sra_pi32(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
}

/// Logically right-shifts each 16-bit element of \a __m by \a __count
/// bits; high-order bits are cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srl_pi16(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
}

/// Logically right-shifts each 32-bit element of \a __m by \a __count
/// bits; high-order bits are cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srl_pi32(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
}

/// Logically right-shifts the 64-bit integer \a __m by \a __count bits;
/// high-order bits are cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srl_si64(__m64 __m, __m64 __count)
{
  return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
}
/// Performs a bitwise AND of two 64-bit integer vectors.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_and_si64(__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
}

/// Performs a bitwise NOT of the first 64-bit integer vector, and then
/// ANDs the result with the second operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
}

/// Performs a bitwise OR of two 64-bit integer vectors.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_or_si64(__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
}

/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
}
/// Compares the 8-bit integer elements of two 64-bit integer vectors of
#include <emmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
/// Duplicates odd-indexed values from a 128-bit vector of [4 x float]
/// into the adjacent even positions.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}

/// Duplicates even-indexed values from a 128-bit vector of
/// [4 x float] into the adjacent odd positions.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}

/// Duplicates the lower double-precision value of \a __a into both
/// elements of the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
/// Establishes a linear address memory range to be monitored and puts
#define __POPCNTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
/// Counts the number of bits in the source operand having a value of 1.
///
#include <tmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_stream_load_si128 (__m128i const *__V)
{
  /* movntdqa: non-temporal load that bypasses the cache hierarchy. */
  return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
}
/* SSE4 Packed Integer Min/Max Instructions. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi16(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
/// Sign-extends each of the lower four 8-bit integer elements of a
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
}
/// Sign-extends each of the lower two 8-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi64(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
}
/// Sign-extends each of the lower four 16-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
{
  /* Sign-extend the low four 16-bit elements to 32 bits. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
}
/// Sign-extends each of the lower two 16-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi64(__m128i __V)
{
  /* Sign-extend the low two 16-bit elements to 64 bits. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
}
/// Sign-extends each of the lower two 32-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi32_epi64(__m128i __V)
{
  /* Sign-extend the low two 32-bit elements to 64 bits. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
}
/* SSE4 Packed Integer Zero-Extension. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi16(__m128i __V)
{
  /* Zero-extension requires the unsigned element type __v16qu. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
/// Zero-extends each of the lower four 8-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi32(__m128i __V)
{
  /* Zero-extension requires the unsigned element type __v16qu. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
}
/// Zero-extends each of the lower two 8-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi64(__m128i __V)
{
  /* Zero-extension requires the unsigned element type __v16qu. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
}
/// Zero-extends each of the lower four 16-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi32(__m128i __V)
{
  /* Zero-extension requires the unsigned element type __v8hu. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
}
/// Zero-extends each of the lower two 16-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi64(__m128i __V)
{
  /* Zero-extension requires the unsigned element type __v8hu. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
}
/// Zero-extends each of the lower two 32-bit integer elements of a
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu32_epi64(__m128i __V)
{
  /* Zero-extension requires the unsigned element type __v4su. */
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
}
/* SSE4 Pack with Unsigned Saturation. */
so we'll do the same. */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS 0x00
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
/* palignr operates on bytes, so pass the operands as __v16qi. */
#define _mm_alignr_epi8(a, b, n) \
  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                     (__v16qi)(__m128i)(b), (n))
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
#define __XMMINTRIN_H
#include <mmintrin.h>
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
#include <mm_malloc.h>
#endif
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
/// Adds the 32-bit float values in the low-order bits of the operands.
///
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  /* a > b is computed as b < a; merge the compare result into lane 0 of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
/// Compares each of the corresponding 32-bit float values of the
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  /* a >= b is computed as b <= a; merge the compare result into lane 0 of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
/// Compares each of the corresponding 32-bit float values of the
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  /* !(a > b) is computed as !(b < a); merge the result into lane 0 of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
/// Compares each of the corresponding 32-bit float values of the
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  /* !(a >= b) is computed as !(b <= a); merge the result into lane 0 of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
/// Compares each of the corresponding 32-bit float values of the
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  /* Packed, may_alias struct so the 8-byte load is legal at any alignment
     and under strict aliasing. */
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
/// Loads two packed float values from the address \a __p into the
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  /* Packed, may_alias struct so the 8-byte load is legal at any alignment
     and under strict aliasing. */
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float *__p)
{
  /* Load four floats, then reverse the element order. */
  __m128 __a = _mm_load_ps(__p);
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
}
/// Create a 128-bit vector of [4 x float] with undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  /* Explicit undef avoids reading an uninitialized variable (UB). */
  return (__m128)__builtin_ia32_undef128();
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  /* storehps expects a __v2si pointer for its destination. */
  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
}
/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pi(__m64 *__p, __m128 __a)
{
  /* storelps expects a __v2si pointer for its destination. */
  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
}
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float *__p, __m128 __a)
{
  /* Broadcast lane 0 to all four lanes, then store. */
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
  _mm_store_ps(__p, __a);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float *__p, __m128 __a)
{
  /* Reverse the element order, then store. */
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
  _mm_store_ps(__p, __a);
}
static __inline__ void __DEFAULT_FN_ATTRS_MMX
_mm_stream_pi(__m64 *__p, __m64 __a)
{
  /* movntq: non-temporal 64-bit store. */
  __builtin_ia32_movntq(__p, __a);
}
/// Moves packed float values from a 128-bit vector of [4 x float] to a
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(float *__p, __m128 __a)
{
  /* movntps: non-temporal 128-bit store that bypasses the cache. */
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
#if defined(__cplusplus)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  /* Interleave the high halves: a2 b2 a3 b3. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  /* Interleave the low halves: a0 b0 a1 b1. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  /* Result: b2 b3 a2 a3 (high half of b into low half of a). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  /* Result: a0 a1 b0 b1 (low half of b into high half of a). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
#define _MM_ALIGN16 __attribute__((aligned(16)))
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
#define _MM_EXCEPT_INVALID (0x0001)
#define _MM_EXCEPT_DENORM (0x0002)
#define _m_ _mm_
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
/* Ugly hack for backwards-compatibility (compatible with gcc) */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
#include <emmintrin.h>
#endif
#endif /* __XMMINTRIN_H */